Java tutorial
package com.senseidb.abacus.api.codec; import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.IOException; import java.text.DecimalFormat; import java.util.ArrayList; import java.util.BitSet; import java.util.HashSet; import java.util.LinkedList; import java.util.List; import java.util.Random; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.lucene42.Lucene42Codec; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; import org.apache.lucene.index.AtomicReader; import org.apache.lucene.index.DocsEnum; import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.index.Fields; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.Version; /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ public class CodecTest { static final String FIELD = "test"; static Directory buildIndex(Iterable<String> datasrc, Codec codec) throws Exception { String idxname = codec == null ? "lucene" : codec.getName(); Directory dir = FSDirectory.open(new File("/tmp/codectest", idxname));//new RAMDirectory(); //Directory dir = new RAMDirectory(); IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_44, new StandardAnalyzer(Version.LUCENE_44)); conf.setUseCompoundFile(false); if (codec != null) { conf.setCodec(codec); } IndexWriter writer = new IndexWriter(dir, conf); for (String doc : datasrc) { if (doc == null) break; doc = doc.trim(); if (doc.length() == 0) continue; Document d = new Document(); FieldType ft = new FieldType(); ft.setIndexed(true); ft.setStored(false); ft.setIndexOptions(IndexOptions.DOCS_ONLY); ft.setOmitNorms(true); Field f = new Field(FIELD, doc, ft); d.add(f); writer.addDocument(d); } writer.forceMerge(1); writer.commit(); writer.close(); return dir; } static Iterable<String> buildDataSrc(File f) throws IOException { final int maxCount = 1000; final BufferedReader freader = new BufferedReader(new FileReader(f)); ArrayList<String> list = new ArrayList<String>(maxCount); while (list.size() < maxCount) { String line = freader.readLine(); if (line == null) break; if (line.trim().length() > 0) { list.add(line); } } freader.close(); return list; } static void testThreaded(int numThreads, final int numIter, final AtomicReader reader, final String field) { Runnable runnable = new Runnable() { public void run() { try { Fields f = reader.fields(); Terms t = f.terms(field); TermsEnum te = t.iterator(null); ArrayList<BytesRef> termList = new ArrayList<BytesRef>(); BytesRef termText; while ((termText = te.next()) != null) { termList.add(termText); } Random rand = new Random(); for (int i = 0; i < numIter; ++i) { int idx = rand.nextInt(termList.size()); termText = termList.get(idx); te = t.iterator(null); te.seekCeil(termText); DocsEnum de = te.docs(null, null); int doc; while ((doc = de.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { } de = te.docs(null, null); doc = -1; while ((doc = de.advance(doc + 2)) != DocIdSetIterator.NO_MORE_DOCS) { } } } catch (Exception e) { e.printStackTrace(); } } }; Thread[] threads = new Thread[numThreads]; for (int i = 0; i < numThreads; ++i) { threads[i] = new Thread(runnable); } for (int i = 0; i < numThreads; ++i) { threads[i].start(); } for (int i = 0; i < numThreads; ++i) { try { threads[i].join(); } catch (InterruptedException e) { e.printStackTrace(); } } } static Iterable<String> buildTweetsDataSource() throws IOException { File inFile = new File("/Users/johnwang/bitbucket/lucene-solr/lucene/codecs/src/resources/tweets2k.txt"); return buildDataSrc(inFile); } static Iterable<String> buildLongTermSrc(int numUnique, int numDocs) throws IOException { Random rand = new Random(); HashSet<Long> uniques = new HashSet<Long>(); while (uniques.size() < numUnique) { long v = Math.abs(rand.nextLong()); uniques.add(v); } Long[] uniqueArr = uniques.toArray(new Long[0]); System.out.println("unique values prepared"); List<String> termArr = new LinkedList<String>(); DecimalFormat format = new DecimalFormat("000000000000000"); BitSet bs = new BitSet(); for (int i = 0; i < numDocs; ++i) { int n = rand.nextInt(uniqueArr.length); String term = format.format(uniqueArr[n].longValue()); termArr.add(term); if (i % 10000 == 0) { System.out.println(i + " docs prepared"); } } return termArr; } public static void main(String[] args) throws Exception { Codec codec = new Lucene42Codec() { @Override public PostingsFormat getPostingsFormatForField(String field) { return new AbacusFacetPostingsFormat(); } }; Iterable<String> datasrc = buildTweetsDataSource(); System.out.println("dataset prepared"); Directory codecIndex = buildIndex(datasrc, codec); System.out.println("new codec indexed"); Directory luceneIndex = buildIndex(datasrc, null); System.out.println("luncene default codec indexed"); //IndexComparator.showIndex(luceneIndex); //IndexComparator.showIndex(codecIndex); boolean same = IndexComparator.compareReaders(luceneIndex, codecIndex); if (!same) { System.out.println("comparison failed"); } else { System.out.println("comparison success"); } } }