List of usage examples for org.apache.lucene.index.IndexWriterConfig#setCodec
public IndexWriterConfig setCodec(Codec codec)
From source file:alix.lucene.Alix.java
License:Open Source License
/** * Start to scan the glob of xml files/*from w w w .ja v a 2 s. c om*/ * * @param indexDir where the lucene indexes are generated * @param anAnalyzer Analyzer to use for analyzed fields * @param similarity instance of Similarity to work with the writer * @throws TransformerConfigurationException */ static public void walk(String xmlGlob, String xslFile, String indexDir) throws IOException, TransformerConfigurationException { info("Lucene, src:" + xmlGlob + " parser:" + xslFile + " index:" + indexDir); Path srcDir = Paths.get(xmlGlob); PathMatcher glob = FileSystems.getDefault().getPathMatcher("glob:*.xml"); if (!Files.isDirectory(srcDir)) { String pattern = srcDir.getFileName().toString(); glob = FileSystems.getDefault().getPathMatcher("glob:" + pattern); srcDir = srcDir.getParent(); } if (!Files.isDirectory(srcDir)) { fatal("FATAL " + srcDir + " NOT FOUND"); } Path indexPath = Paths.get(indexDir); Files.createDirectories(indexPath); Directory dir = FSDirectory.open(indexPath); // TODO configure analyzers Analyzer analyzer = new XmlAnalyzer(); IndexWriterConfig conf = new IndexWriterConfig(analyzer); conf.setOpenMode(OpenMode.CREATE_OR_APPEND); conf.setSimilarity(new BM25Similarity()); conf.setCodec(new ChapitreCodec()); // Optional: for better indexing performance, if you // are indexing many documents, increase the RAM // buffer. 
But if you do this, increase the max heap // size to the JVM (eg add -Xmx512m or -Xmx1g): // // conf.setRAMBufferSizeMB(256.0); lucwriter = new IndexWriter(dir, conf); System.setProperty("javax.xml.transform.TransformerFactory", "net.sf.saxon.TransformerFactoryImpl"); TransformerFactory tf = TransformerFactory.newInstance(); tf.setAttribute("http://saxon.sf.net/feature/version-warning", Boolean.FALSE); tf.setAttribute("http://saxon.sf.net/feature/recoveryPolicy", new Integer(0)); parser = tf.newTransformer(new StreamSource(xslFile)); final PathMatcher matcher = glob; // transmit the matcher by a final variable to the anonymous class Files.walkFileTree(srcDir, new SimpleFileVisitor<Path>() { @Override public FileVisitResult visitFile(Path path, BasicFileAttributes attrs) { if (path.getFileName().toString().startsWith(".")) return FileVisitResult.CONTINUE; if (!matcher.matches(path.getFileName())) return FileVisitResult.CONTINUE; parse(path); return FileVisitResult.CONTINUE; } public FileVisitResult preVisitDirectory(Path path, BasicFileAttributes attrs) { // .git, .svn if (path.getFileName().toString().startsWith(".")) return FileVisitResult.SKIP_SUBTREE; return FileVisitResult.CONTINUE; } }); lucwriter.commit(); // NOTE: if you want to maximize search performance, // you can optionally call forceMerge here. This can be // a terribly costly operation, so generally it's only // worth it when your index is relatively static (ie // you're done adding documents to it): // lucwriter.forceMerge(1); lucwriter.close(); }
From source file:com.foundationdb.lucene.SimpleTest.java
License:Open Source License
@Test public void indexBasic() throws Exception { StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_44); IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_44, analyzer); // recreate the index on each execution config.setOpenMode(IndexWriterConfig.OpenMode.CREATE); config.setCodec(new FDBCodec()); FDBDirectory dir = createDirectoryForMethod(); IndexWriter writer = new IndexWriter(dir, config); try {/* w w w.j av a 2s. com*/ writer.addDocument(Arrays.asList(new TextField("title", "The title of my first document", Store.YES), new TextField("content", "The content of the first document", Store.NO))); writer.addDocument(Arrays.asList(new TextField("title", "The title of the second document", Store.YES), new TextField("content", "And this is the content", Store.NO))); } finally { writer.close(); } assertDocumentsAreThere(dir, 2); }
From source file:com.rocana.lucene.codec.v1.RocanaBasePostingsFormatTestCase.java
License:Apache License
/**
 * Exercises the postings-writing path: installs a stats-gathering PostingsFormat
 * for the "body" field, indexes random line-file docs, then verifies the
 * accumulated docFreq/totalTermFreq against what the final IndexReader reports.
 */
@Override
public void testInvertedWrite() throws Exception {
    Directory dir = newDirectory();
    MockAnalyzer analyzer = new MockAnalyzer(random());
    analyzer.setMaxTokenLength(TestUtil.nextInt(random(), 1, IndexWriter.MAX_TERM_LENGTH));
    IndexWriterConfig iwc = newIndexWriterConfig(analyzer);

    // Must be concurrent because thread(s) can be merging
    // while up to one thread flushes, and each of those
    // threads iterates over the map while the flushing
    // thread might be adding to it:
    final Map<String, TermFreqs> termFreqs = new ConcurrentHashMap<>();

    final AtomicLong sumDocFreq = new AtomicLong();
    final AtomicLong sumTotalTermFreq = new AtomicLong();

    // TODO: would be better to use / delegate to the current
    // Codec returned by getCodec()

    iwc.setCodec(new AssertingCodec() {
        @Override
        public PostingsFormat getPostingsFormatForField(String field) {

            PostingsFormat p = getCodec().postingsFormat();
            if (p instanceof PerFieldPostingsFormat) {
                p = ((PerFieldPostingsFormat) p).getPostingsFormatForField(field);
            }
            if (p instanceof RocanaPerFieldPostingsFormat) {
                p = ((RocanaPerFieldPostingsFormat) p).getPostingsFormatForField(field);
            }
            final PostingsFormat defaultPostingsFormat = p;

            final Thread mainThread = Thread.currentThread();

            if (field.equals("body")) {

                // A PF that counts up some stats and then in
                // the end we verify the stats match what the
                // final IndexReader says, just to exercise the
                // new freedom of iterating the postings more
                // than once at flush/merge:

                return new PostingsFormat(defaultPostingsFormat.getName()) {

                    @Override
                    public FieldsConsumer fieldsConsumer(final SegmentWriteState state) throws IOException {

                        final FieldsConsumer fieldsConsumer = defaultPostingsFormat.fieldsConsumer(state);

                        return new FieldsConsumer() {
                            @Override
                            public void write(Fields fields) throws IOException {
                                fieldsConsumer.write(fields);

                                boolean isMerge = state.context.context == IOContext.Context.MERGE;

                                // We only use one thread for flushing
                                // in this test:
                                assert isMerge || Thread.currentThread() == mainThread;

                                // We iterate the provided TermsEnum
                                // twice, so we exercise this new freedom
                                // with the inverted API; if
                                // addOnSecondPass is true, we add up
                                // term stats on the 2nd iteration:
                                boolean addOnSecondPass = random().nextBoolean();

                                //System.out.println("write isMerge=" + isMerge + " 2ndPass=" + addOnSecondPass);

                                // Gather our own stats:
                                Terms terms = fields.terms("body");
                                assert terms != null;

                                TermsEnum termsEnum = terms.iterator();
                                PostingsEnum docs = null;
                                while (termsEnum.next() != null) {
                                    BytesRef term = termsEnum.term();
                                    // TODO: also sometimes ask for payloads/offsets?
                                    boolean noPositions = random().nextBoolean();
                                    if (noPositions) {
                                        docs = termsEnum.postings(docs, PostingsEnum.FREQS);
                                    } else {
                                        docs = termsEnum.postings(null, PostingsEnum.POSITIONS);
                                    }
                                    int docFreq = 0;
                                    long totalTermFreq = 0;
                                    while (docs.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
                                        docFreq++;
                                        totalTermFreq += docs.freq();
                                        int limit = TestUtil.nextInt(random(), 1, docs.freq());
                                        if (!noPositions) {
                                            for (int i = 0; i < limit; i++) {
                                                docs.nextPosition();
                                            }
                                        }
                                    }
                                    String termString = term.utf8ToString();

                                    // During merge we should only see terms
                                    // we had already seen during a
                                    // previous flush:
                                    assertTrue(isMerge == false || termFreqs.containsKey(termString));

                                    if (isMerge == false) {
                                        if (addOnSecondPass == false) {
                                            TermFreqs tf = termFreqs.get(termString);
                                            if (tf == null) {
                                                tf = new TermFreqs();
                                                termFreqs.put(termString, tf);
                                            }
                                            tf.docFreq += docFreq;
                                            tf.totalTermFreq += totalTermFreq;
                                            sumDocFreq.addAndGet(docFreq);
                                            sumTotalTermFreq.addAndGet(totalTermFreq);
                                        } else if (termFreqs.containsKey(termString) == false) {
                                            // Add placeholder (2nd pass will
                                            // set its counts):
                                            termFreqs.put(termString, new TermFreqs());
                                        }
                                    }
                                }

                                // Also test seeking the TermsEnum:
                                for (String term : termFreqs.keySet()) {
                                    if (termsEnum.seekExact(new BytesRef(term))) {
                                        // TODO: also sometimes ask for payloads/offsets?
                                        boolean noPositions = random().nextBoolean();
                                        if (noPositions) {
                                            docs = termsEnum.postings(docs, PostingsEnum.FREQS);
                                        } else {
                                            docs = termsEnum.postings(null, PostingsEnum.POSITIONS);
                                        }
                                        int docFreq = 0;
                                        long totalTermFreq = 0;
                                        while (docs.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
                                            docFreq++;
                                            totalTermFreq += docs.freq();
                                            int limit = TestUtil.nextInt(random(), 1, docs.freq());
                                            if (!noPositions) {
                                                for (int i = 0; i < limit; i++) {
                                                    docs.nextPosition();
                                                }
                                            }
                                        }

                                        if (isMerge == false && addOnSecondPass) {
                                            TermFreqs tf = termFreqs.get(term);
                                            assert tf != null;
                                            tf.docFreq += docFreq;
                                            tf.totalTermFreq += totalTermFreq;
                                            sumDocFreq.addAndGet(docFreq);
                                            sumTotalTermFreq.addAndGet(totalTermFreq);
                                        }

                                        //System.out.println("  term=" + term + " docFreq=" + docFreq + " ttDF=" + termToDocFreq.get(term));
                                        assertTrue(docFreq <= termFreqs.get(term).docFreq);
                                        assertTrue(totalTermFreq <= termFreqs.get(term).totalTermFreq);
                                    }
                                }

                                // Also test seekCeil
                                for (int iter = 0; iter < 10; iter++) {
                                    BytesRef term = new BytesRef(
                                            TestUtil.randomRealisticUnicodeString(random()));
                                    SeekStatus status = termsEnum.seekCeil(term);
                                    if (status == SeekStatus.NOT_FOUND) {
                                        assertTrue(term.compareTo(termsEnum.term()) < 0);
                                    }
                                }
                            }

                            @Override
                            public void close() throws IOException {
                                fieldsConsumer.close();
                            }
                        };
                    }

                    @Override
                    public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
                        return defaultPostingsFormat.fieldsProducer(state);
                    }
                };
            } else {
                return defaultPostingsFormat;
            }
        }
    });

    RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
    LineFileDocs docs = new LineFileDocs(random());
    int bytesToIndex = atLeast(100) * 1024;
    int bytesIndexed = 0;
    while (bytesIndexed < bytesToIndex) {
        Document doc = docs.nextDoc();
        w.addDocument(doc);
        bytesIndexed += RamUsageTester.sizeOf(doc);
    }

    IndexReader r = w.getReader();
    w.close();

    // Verify the stats we gathered during flush/merge against the reader.
    Terms terms = MultiFields.getTerms(r, "body");
    assertEquals(sumDocFreq.get(), terms.getSumDocFreq());
    assertEquals(sumTotalTermFreq.get(), terms.getSumTotalTermFreq());

    TermsEnum termsEnum = terms.iterator();
    long termCount = 0;
    boolean supportsOrds = true;
    while (termsEnum.next() != null) {
        BytesRef term = termsEnum.term();
        assertEquals(termFreqs.get(term.utf8ToString()).docFreq, termsEnum.docFreq());
        assertEquals(termFreqs.get(term.utf8ToString()).totalTermFreq, termsEnum.totalTermFreq());
        if (supportsOrds) {
            long ord;
            try {
                ord = termsEnum.ord();
            } catch (UnsupportedOperationException uoe) {
                supportsOrds = false;
                ord = -1;
            }
            if (ord != -1) {
                assertEquals(termCount, ord);
            }
        }
        termCount++;
    }
    assertEquals(termFreqs.size(), termCount);

    r.close();
    dir.close();
}
From source file:com.rocana.lucene.codec.v1.TestBlockPostingsFormat2.java
License:Apache License
@Override public void setUp() throws Exception { super.setUp(); dir = newFSDirectory(createTempDir("testDFBlockSize")); IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random())); iwc.setCodec(TestUtil.alwaysPostingsFormat(new RocanaLucene50PostingsFormat())); iw = new RandomIndexWriter(random(), dir, iwc); iw.setDoRandomForceMerge(false); // we will ourselves }
From source file:com.rocana.lucene.codec.v1.TestBlockPostingsFormat2.java
License:Apache License
@Override public void tearDown() throws Exception { iw.close();//w w w . j a va 2 s. c o m TestUtil.checkIndex(dir); // for some extra coverage, checkIndex before we forceMerge IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random())); iwc.setCodec(TestUtil.alwaysPostingsFormat(new RocanaLucene50PostingsFormat())); iwc.setOpenMode(OpenMode.APPEND); IndexWriter iw = new IndexWriter(dir, iwc); iw.forceMerge(1); iw.close(); dir.close(); // just force a checkindex for now super.tearDown(); }
From source file:com.rocana.lucene.codec.v1.TestBlockPostingsFormat3.java
License:Apache License
/**
 * Indexes MAXDOC documents across eight fields covering every index-options /
 * payload combination under the Rocana postings format, verifies, force-merges
 * to one segment, and verifies again.
 */
public void test() throws Exception {
    Directory dir = newDirectory();
    // Per-field analyzer: payload filters are attached based on the field name.
    Analyzer analyzer = new Analyzer(Analyzer.PER_FIELD_REUSE_STRATEGY) {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer();
            if (fieldName.contains("payloadsFixed")) {
                TokenFilter filter = new MockFixedLengthPayloadFilter(new Random(0), tokenizer, 1);
                return new TokenStreamComponents(tokenizer, filter);
            } else if (fieldName.contains("payloadsVariable")) {
                TokenFilter filter = new MockVariableLengthPayloadFilter(new Random(0), tokenizer);
                return new TokenStreamComponents(tokenizer, filter);
            } else {
                return new TokenStreamComponents(tokenizer);
            }
        }
    };
    IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
    iwc.setCodec(TestUtil.alwaysPostingsFormat(new RocanaLucene50PostingsFormat()));
    // TODO we could actually add more fields implemented with different PFs
    // or, just put this test into the usual rotation?
    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
    Document doc = new Document();

    FieldType docsOnlyType = new FieldType(TextField.TYPE_NOT_STORED);
    // turn this on for a cross-check
    docsOnlyType.setStoreTermVectors(true);
    docsOnlyType.setIndexOptions(IndexOptions.DOCS);

    FieldType docsAndFreqsType = new FieldType(TextField.TYPE_NOT_STORED);
    // turn this on for a cross-check
    docsAndFreqsType.setStoreTermVectors(true);
    docsAndFreqsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);

    FieldType positionsType = new FieldType(TextField.TYPE_NOT_STORED);
    // turn these on for a cross-check
    positionsType.setStoreTermVectors(true);
    positionsType.setStoreTermVectorPositions(true);
    positionsType.setStoreTermVectorOffsets(true);
    positionsType.setStoreTermVectorPayloads(true);

    FieldType offsetsType = new FieldType(positionsType);
    offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);

    Field field1 = new Field("field1docs", "", docsOnlyType);
    Field field2 = new Field("field2freqs", "", docsAndFreqsType);
    Field field3 = new Field("field3positions", "", positionsType);
    Field field4 = new Field("field4offsets", "", offsetsType);
    Field field5 = new Field("field5payloadsFixed", "", positionsType);
    Field field6 = new Field("field6payloadsVariable", "", positionsType);
    Field field7 = new Field("field7payloadsFixedOffsets", "", offsetsType);
    Field field8 = new Field("field8payloadsVariableOffsets", "", offsetsType);
    doc.add(field1);
    doc.add(field2);
    doc.add(field3);
    doc.add(field4);
    doc.add(field5);
    doc.add(field6);
    doc.add(field7);
    doc.add(field8);

    // Reuse the same Document/Field instances; only the values change per doc.
    for (int i = 0; i < MAXDOC; i++) {
        String stringValue = Integer.toString(i) + " verycommon " + English.intToEnglish(i).replace('-', ' ')
                + " " + TestUtil.randomSimpleString(random());
        field1.setStringValue(stringValue);
        field2.setStringValue(stringValue);
        field3.setStringValue(stringValue);
        field4.setStringValue(stringValue);
        field5.setStringValue(stringValue);
        field6.setStringValue(stringValue);
        field7.setStringValue(stringValue);
        field8.setStringValue(stringValue);
        iw.addDocument(doc);
    }
    iw.close();
    verify(dir);
    // for some extra coverage, checkIndex before we forceMerge
    TestUtil.checkIndex(dir);

    // Reopen in APPEND mode and force-merge down to a single segment.
    iwc = newIndexWriterConfig(analyzer);
    iwc.setCodec(TestUtil.alwaysPostingsFormat(new RocanaLucene50PostingsFormat()));
    iwc.setOpenMode(OpenMode.APPEND);
    IndexWriter iw2 = new IndexWriter(dir, iwc);
    iw2.forceMerge(1);
    iw2.close();
    verify(dir);
    dir.close();
}
From source file:com.rocana.lucene.codec.v1.TestRocanaPerFieldPostingsFormat2.java
License:Apache License
/**
 * Writes segments with one codec, reopens the index with a different codec,
 * adds more docs, and verifies old and new segments stay searchable through
 * commits and a final forceMerge.
 */
@Test
public void testChangeCodecAndMerge() throws IOException {
    Directory dir = newDirectory();
    if (VERBOSE) {
        System.out.println("TEST: make new index");
    }
    IndexWriterConfig iwconf = newIndexWriterConfig(new MockAnalyzer(random())).setOpenMode(OpenMode.CREATE)
            .setCodec(new MockCodec());
    iwconf.setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH);
    //((LogMergePolicy) iwconf.getMergePolicy()).setMergeFactor(10);
    IndexWriter writer = newWriter(dir, iwconf);

    addDocs(writer, 10);
    writer.commit();
    assertQuery(new Term("content", "aaa"), dir, 10);
    if (VERBOSE) {
        System.out.println("TEST: addDocs3");
    }
    addDocs3(writer, 10);
    writer.commit();
    writer.close();

    assertQuery(new Term("content", "ccc"), dir, 10);
    assertQuery(new Term("content", "aaa"), dir, 10);
    Codec codec = iwconf.getCodec();

    // Reopen in APPEND mode with a different codec for new segments.
    iwconf = newIndexWriterConfig(new MockAnalyzer(random())).setOpenMode(OpenMode.APPEND).setCodec(codec);
    //((LogMergePolicy) iwconf.getMergePolicy()).setNoCFSRatio(0.0);
    //((LogMergePolicy) iwconf.getMergePolicy()).setMergeFactor(10);
    iwconf.setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH);

    iwconf.setCodec(new MockCodec2()); // uses standard for field content
    writer = newWriter(dir, iwconf);
    // swap in new codec for currently written segments
    if (VERBOSE) {
        System.out.println("TEST: add docs w/ Standard codec for content field");
    }
    addDocs2(writer, 10);
    writer.commit();
    codec = iwconf.getCodec();
    assertEquals(30, writer.maxDoc());
    assertQuery(new Term("content", "bbb"), dir, 10);
    assertQuery(new Term("content", "ccc"), dir, 10);
    ////  assertQuery(new Term("content", "aaa"), dir, 10);

    if (VERBOSE) {
        System.out.println("TEST: add more docs w/ new codec");
    }
    addDocs2(writer, 10);
    writer.commit();
    assertQuery(new Term("content", "ccc"), dir, 10);
    assertQuery(new Term("content", "bbb"), dir, 20);
    assertQuery(new Term("content", "aaa"), dir, 10);
    assertEquals(40, writer.maxDoc());

    if (VERBOSE) {
        System.out.println("TEST: now optimize");
    }
    writer.forceMerge(1);
    assertEquals(40, writer.maxDoc());
    writer.close();
    assertQuery(new Term("content", "ccc"), dir, 10);
    assertQuery(new Term("content", "bbb"), dir, 20);
    assertQuery(new Term("content", "aaa"), dir, 10);

    dir.close();
}
From source file:com.rocana.lucene.codec.v1.TestRocanaPerFieldPostingsFormat2.java
License:Apache License
private void doTestMixedPostings(Codec codec) throws Exception { Directory dir = newDirectory();//from www . j av a 2 s.c om IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random())); iwc.setCodec(codec); RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); Document doc = new Document(); FieldType ft = new FieldType(TextField.TYPE_NOT_STORED); // turn on vectors for the checkindex cross-check ft.setStoreTermVectors(true); ft.setStoreTermVectorOffsets(true); ft.setStoreTermVectorPositions(true); Field idField = new Field("id", "", ft); Field dateField = new Field("date", "", ft); doc.add(idField); doc.add(dateField); for (int i = 0; i < 100; i++) { idField.setStringValue(Integer.toString(random().nextInt(50))); dateField.setStringValue(Integer.toString(random().nextInt(100))); iw.addDocument(doc); } iw.close(); dir.close(); // checkindex }
From source file:com.senseidb.abacus.api.codec.CodecTest.java
License:Apache License
static Directory buildIndex(Iterable<String> datasrc, Codec codec) throws Exception { String idxname = codec == null ? "lucene" : codec.getName(); Directory dir = FSDirectory.open(new File("/tmp/codectest", idxname));//new RAMDirectory(); //Directory dir = new RAMDirectory(); IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_44, new StandardAnalyzer(Version.LUCENE_44)); conf.setUseCompoundFile(false);// w w w . j a va2 s . co m if (codec != null) { conf.setCodec(codec); } IndexWriter writer = new IndexWriter(dir, conf); for (String doc : datasrc) { if (doc == null) break; doc = doc.trim(); if (doc.length() == 0) continue; Document d = new Document(); FieldType ft = new FieldType(); ft.setIndexed(true); ft.setStored(false); ft.setIndexOptions(IndexOptions.DOCS_ONLY); ft.setOmitNorms(true); Field f = new Field(FIELD, doc, ft); d.add(f); writer.addDocument(d); } writer.forceMerge(1); writer.commit(); writer.close(); return dir; }
From source file:com.sindicetech.siren.demo.SimpleIndexer.java
License:Open Source License
private IndexWriter initializeIndexWriter() throws IOException { final IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_4_9, this.initializeAnalyzer()); // Register the SIREn codec config.setCodec(new Siren10Codec()); return new IndexWriter(dir, config); }