List of usage examples for the org.apache.lucene.util.BytesRef no-argument constructor
public BytesRef()
From source file:cc.twittertools.index.ExtractTermStatisticsFromIndex.java
License:Apache License
@SuppressWarnings("static-access") public static void main(String[] args) throws Exception { Options options = new Options(); options.addOption(OptionBuilder.withArgName("dir").hasArg().withDescription("index").create(INDEX_OPTION)); options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("min").create(MIN_OPTION)); CommandLine cmdline = null;/*from w ww. j av a 2 s . c om*/ CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); System.exit(-1); } if (!cmdline.hasOption(INDEX_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(ExtractTermStatisticsFromIndex.class.getName(), options); System.exit(-1); } String indexLocation = cmdline.getOptionValue(INDEX_OPTION); int min = cmdline.hasOption(MIN_OPTION) ? Integer.parseInt(cmdline.getOptionValue(MIN_OPTION)) : 1; PrintStream out = new PrintStream(System.out, true, "UTF-8"); IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexLocation))); Terms terms = SlowCompositeReaderWrapper.wrap(reader).terms(StatusField.TEXT.name); TermsEnum termsEnum = terms.iterator(TermsEnum.EMPTY); long missingCnt = 0; int skippedTerms = 0; BytesRef bytes = new BytesRef(); while ((bytes = termsEnum.next()) != null) { byte[] buf = new byte[bytes.length]; System.arraycopy(bytes.bytes, 0, buf, 0, bytes.length); String term = new String(buf, "UTF-8"); int df = termsEnum.docFreq(); long cf = termsEnum.totalTermFreq(); if (df < min) { skippedTerms++; missingCnt += cf; continue; } out.println(term + "\t" + df + "\t" + cf); } reader.close(); out.close(); System.err.println("skipped terms: " + skippedTerms + ", cnt: " + missingCnt); }
From source file:com.kmwllc.search.graph.GraphTermsCollector.java
private void addEdgeIdsToResult(int doc) throws IOException {
    // position the doc values at the document whose edge ids we want
    docTermOrds.setDocument(doc);
    BytesRef scratch = new BytesRef();
    long ord;
    while ((ord = docTermOrds.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
        scratch = docTermOrds.lookupOrd(ord);
        // add the edge id to the collector terms.
        // TODO: how do we handle non-string type fields? do i need to worry about that here?
        collectorTerms.add(scratch);
    }
}
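lookupOrd may hand back a shared ref whose bytes are reused on the next call, so storing the returned scratch directly is only safe if collectorTerms copies on add (a BytesRefHash does; a plain java.util.Set<BytesRef> does not). A hedged variant, under the assumption that collectorTerms keeps the objects it is given:

long ord;
while ((ord = docTermOrds.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
    // deepCopyOf allocates a private buffer, so later lookups cannot mutate the stored id
    collectorTerms.add(BytesRef.deepCopyOf(docTermOrds.lookupOrd(ord)));
}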
From source file:com.lucid.solr.sidecar.SidecarIndexReaderFactory.java
License:Apache License
DirectoryReader buildParallelReader(DirectoryReader main, SolrIndexSearcher source, boolean rebuild) {
    try {
        if (source == null) {
            throw new Exception("Source collection is missing.");
        }
        // create as a sibling path of the main index
        Directory d = main.directory();
        File primaryDir = null;
        if (d instanceof FSDirectory) {
            String path = ((FSDirectory) d).getDirectory().getPath();
            primaryDir = new File(path);
            sidecarIndex = new File(primaryDir.getParentFile(), sidecarIndexLocation);
        } else {
            String secondaryPath = System.getProperty("java.io.tmpdir") + File.separator + sidecarIndexLocation
                    + "-" + System.currentTimeMillis();
            sidecarIndex = new File(secondaryPath);
        }
        // create a new tmp dir for the secondary indexes
        File secondaryIndex = new File(sidecarIndex, System.currentTimeMillis() + "-index");
        if (rebuild) {
            safeDelete(sidecarIndex);
        }
        parallelFields.addAll(source.getFieldNames());
        parallelFields.remove("id");
        LOG.debug("building a new index");
        Directory dir = FSDirectory.open(secondaryIndex);
        if (IndexWriter.isLocked(dir)) {
            // try forcing unlock
            try {
                IndexWriter.unlock(dir);
            } catch (Exception e) {
                LOG.warn("Failed to unlock " + secondaryIndex);
            }
        }
        int[] mergeTargets;
        AtomicReader[] subReaders = SidecarIndexReader.getSequentialSubReaders(main);
        if (subReaders == null || subReaders.length == 0) {
            mergeTargets = new int[] { main.maxDoc() };
        } else {
            mergeTargets = new int[subReaders.length];
            for (int i = 0; i < subReaders.length; i++) {
                mergeTargets[i] = subReaders[i].maxDoc();
            }
        }
        Version ver = currentCore.getLatestSchema().getDefaultLuceneMatchVersion();
        IndexWriterConfig cfg = new IndexWriterConfig(ver, currentCore.getLatestSchema().getAnalyzer());
        //cfg.setInfoStream(System.err);
        cfg.setMergeScheduler(new SerialMergeScheduler());
        cfg.setMergePolicy(new SidecarMergePolicy(mergeTargets, false));
        IndexWriter iw = new IndexWriter(dir, cfg);
        LOG.info("processing " + main.maxDoc() + " docs / " + main.numDeletedDocs() + " dels in main index");
        int boostedDocs = 0;
        Bits live = MultiFields.getLiveDocs(main);

        int targetPos = 0;
        int nextTarget = mergeTargets[targetPos];
        BytesRef idRef = new BytesRef();
        for (int i = 0; i < main.maxDoc(); i++) {
            if (i == nextTarget) {
                iw.commit();
                nextTarget = nextTarget + mergeTargets[++targetPos];
            }
            if (live != null && !live.get(i)) {
                addDummy(iw); // this is required to preserve doc numbers.
                continue;
            } else {
                DocumentStoredFieldVisitor visitor = new DocumentStoredFieldVisitor(docIdField);
                main.document(i, visitor);
                Document doc = visitor.getDocument();
                // get docId
                String id = doc.get(docIdField);
                if (id == null) {
                    LOG.debug("missing id, docNo=" + i);
                    addDummy(iw);
                    continue;
                } else {
                    // find the data, if any
                    doc = lookup(source, id, idRef, parallelFields);
                    if (doc == null) {
                        LOG.debug("missing boost data, docId=" + id);
                        addDummy(iw);
                        continue;
                    } else {
                        LOG.debug("adding boost data, docId=" + id + ", b=" + doc);
                        iw.addDocument(doc);
                        boostedDocs++;
                    }
                }
            }
        }
        iw.close();
        DirectoryReader other = DirectoryReader.open(dir);
        LOG.info("SidecarIndexReader with " + boostedDocs + " boosted documents.");
        SidecarIndexReader pr = createSidecarIndexReader(main, other, sourceCollection, secondaryIndex);
        return pr;
    } catch (Exception e) {
        LOG.warn("Unable to build parallel index: " + e.toString(), e);
        LOG.warn("Proceeding with single main index.");
        try {
            return new SidecarIndexReader(this, main, null, SidecarIndexReader.getSequentialSubReaders(main),
                    sourceCollection, null);
        } catch (Exception e1) {
            LOG.warn("Unexpected exception, returning single main index", e1);
            return main;
        }
    }
}
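The idRef scratch is allocated once, outside the per-document loop, and handed to lookup() so each id can be re-encoded into the same buffer on every iteration. A sketch of that reuse idiom with a hypothetical lookupId() helper (the real lookup() is not shown in this snippet; assumes Lucene 4.x, where BytesRef.copyChars is still available):

// hypothetical: resolve a string id to a doc number, refilling the caller's scratch buffer
static int lookupId(AtomicReader reader, String id, BytesRef scratch) throws IOException {
    scratch.copyChars(id); // grows scratch.bytes only when the id is longer than before
    Terms terms = reader.terms("id");
    if (terms == null) {
        return -1;
    }
    TermsEnum te = terms.iterator(null);
    if (!te.seekExact(scratch)) {
        return -1;
    }
    DocsEnum de = te.docs(reader.getLiveDocs(), null, DocsEnum.FLAG_NONE);
    int doc = de.nextDoc();
    return doc == DocIdSetIterator.NO_MORE_DOCS ? -1 : doc;
}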
From source file:com.lucure.core.codec.CompressingStoredFieldsReader.java
License:Apache License
/** Sole constructor. */
public CompressingStoredFieldsReader(Directory d, SegmentInfo si, String segmentSuffix, FieldInfos fn,
        IOContext context, String formatName, CompressionMode compressionMode) throws IOException {
    this.compressionMode = compressionMode;
    final String segment = si.name;
    boolean success = false;
    fieldInfos = fn;
    numDocs = si.getDocCount();
    ChecksumIndexInput indexStream = null;
    try {
        final String indexStreamFN = IndexFileNames.segmentFileName(segment, segmentSuffix, FIELDS_INDEX_EXTENSION);
        final String fieldsStreamFN = IndexFileNames.segmentFileName(segment, segmentSuffix, FIELDS_EXTENSION);
        // Load the index into memory
        indexStream = d.openChecksumInput(indexStreamFN, context);
        final String codecNameIdx = formatName + CODEC_SFX_IDX;
        version = CodecUtil.checkHeader(indexStream, codecNameIdx, VERSION_START, VERSION_CURRENT);
        assert CodecUtil.headerLength(codecNameIdx) == indexStream.getFilePointer();
        indexReader = new CompressingStoredFieldsIndexReader(indexStream, si);

        long maxPointer = -1;
        if (version >= VERSION_CHECKSUM) {
            maxPointer = indexStream.readVLong();
            CodecUtil.checkFooter(indexStream);
        } else {
            CodecUtil.checkEOF(indexStream);
        }
        indexStream.close();
        indexStream = null;

        // Open the data file and read metadata
        fieldsStream = d.openInput(fieldsStreamFN, context);
        if (version >= VERSION_CHECKSUM) {
            if (maxPointer + CodecUtil.footerLength() != fieldsStream.length()) {
                throw new CorruptIndexException("Invalid fieldsStream maxPointer (file truncated?): maxPointer="
                        + maxPointer + ", length=" + fieldsStream.length());
            }
        } else {
            maxPointer = fieldsStream.length();
        }
        this.maxPointer = maxPointer;
        final String codecNameDat = formatName + CODEC_SFX_DAT;
        final int fieldsVersion = CodecUtil.checkHeader(fieldsStream, codecNameDat, VERSION_START, VERSION_CURRENT);
        if (version != fieldsVersion) {
            throw new CorruptIndexException("Version mismatch between stored fields index and data: " + version
                    + " != " + fieldsVersion);
        }
        assert CodecUtil.headerLength(codecNameDat) == fieldsStream.getFilePointer();

        if (version >= VERSION_BIG_CHUNKS) {
            chunkSize = fieldsStream.readVInt();
        } else {
            chunkSize = -1;
        }
        packedIntsVersion = fieldsStream.readVInt();
        decompressor = compressionMode.newDecompressor();
        this.bytes = new BytesRef();

        if (version >= VERSION_CHECKSUM) {
            // NOTE: data file is too costly to verify checksum against all the bytes on open,
            // but for now we at least verify proper structure of the checksum footer: which looks
            // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
            // such as file truncation.
            CodecUtil.retrieveChecksum(fieldsStream);
        }

        success = true;
    } finally {
        if (!success) {
            IOUtils.closeWhileHandlingException(this, indexStream);
        }
    }
}
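The empty this.bytes created near the end is the reader's long-lived decompression scratch: Decompressor.decompress(in, originalLength, offset, length, ref) grows ref.bytes as needed and points ref.offset/ref.length at the requested window, so one zero-length BytesRef can serve every visitDocument call. A sketch of that contract, assuming a decompressor obtained from this codec's CompressionMode:

BytesRef scratch = new BytesRef(); // starts as a zero-length buffer
decompressor.decompress(in, originalLength, offset, length, scratch);
// scratch.bytes was grown as needed; bytes[scratch.offset..scratch.offset+scratch.length) is valid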
From source file:com.lucure.core.codec.CompressingStoredFieldsReader.java
License:Apache License
@Override
public void visitDocument(int docID, StoredFieldVisitor visitor) throws IOException {
    fieldsStream.seek(indexReader.getStartPointer(docID));

    final int docBase = fieldsStream.readVInt();
    final int chunkDocs = fieldsStream.readVInt();
    if (docID < docBase || docID >= docBase + chunkDocs || docBase + chunkDocs > numDocs) {
        throw new CorruptIndexException("Corrupted: docID=" + docID + ", docBase=" + docBase + ", chunkDocs="
                + chunkDocs + ", numDocs=" + numDocs + " (resource=" + fieldsStream + ")");
    }

    final int numStoredFields, offset, length, totalLength;
    if (chunkDocs == 1) {
        numStoredFields = fieldsStream.readVInt();
        offset = 0;
        length = fieldsStream.readVInt();
        totalLength = length;
    } else {
        final int bitsPerStoredFields = fieldsStream.readVInt();
        if (bitsPerStoredFields == 0) {
            numStoredFields = fieldsStream.readVInt();
        } else if (bitsPerStoredFields > 31) {
            throw new CorruptIndexException(
                    "bitsPerStoredFields=" + bitsPerStoredFields + " (resource=" + fieldsStream + ")");
        } else {
            final long filePointer = fieldsStream.getFilePointer();
            final PackedInts.Reader reader = PackedInts.getDirectReaderNoHeader(fieldsStream,
                    PackedInts.Format.PACKED, packedIntsVersion, chunkDocs, bitsPerStoredFields);
            numStoredFields = (int) (reader.get(docID - docBase));
            fieldsStream.seek(filePointer
                    + PackedInts.Format.PACKED.byteCount(packedIntsVersion, chunkDocs, bitsPerStoredFields));
        }

        final int bitsPerLength = fieldsStream.readVInt();
        if (bitsPerLength == 0) {
            length = fieldsStream.readVInt();
            offset = (docID - docBase) * length;
            totalLength = chunkDocs * length;
        } else if (bitsPerLength > 31) { // guard the value actually being read here
            throw new CorruptIndexException(
                    "bitsPerLength=" + bitsPerLength + " (resource=" + fieldsStream + ")");
        } else {
            final PackedInts.ReaderIterator it = PackedInts.getReaderIteratorNoHeader(fieldsStream,
                    PackedInts.Format.PACKED, packedIntsVersion, chunkDocs, bitsPerLength, 1);
            int off = 0;
            for (int i = 0; i < docID - docBase; ++i) {
                off += it.next();
            }
            offset = off;
            length = (int) it.next();
            off += length;
            for (int i = docID - docBase + 1; i < chunkDocs; ++i) {
                off += it.next();
            }
            totalLength = off;
        }
    }

    if ((length == 0) != (numStoredFields == 0)) {
        throw new CorruptIndexException(
                "length=" + length + ", numStoredFields=" + numStoredFields + " (resource=" + fieldsStream + ")");
    }
    if (numStoredFields == 0) {
        // nothing to do
        return;
    }

    final DataInput documentInput;
    if (version >= VERSION_BIG_CHUNKS && totalLength >= 2 * chunkSize) {
        assert chunkSize > 0;
        assert offset < chunkSize;

        decompressor.decompress(fieldsStream, chunkSize, offset, Math.min(length, chunkSize - offset), bytes);
        documentInput = new DataInput() {

            int decompressed = bytes.length;

            void fillBuffer() throws IOException {
                assert decompressed <= length;
                if (decompressed == length) {
                    throw new EOFException();
                }
                final int toDecompress = Math.min(length - decompressed, chunkSize);
                decompressor.decompress(fieldsStream, toDecompress, 0, toDecompress, bytes);
                decompressed += toDecompress;
            }

            @Override
            public byte readByte() throws IOException {
                if (bytes.length == 0) {
                    fillBuffer();
                }
                --bytes.length;
                return bytes.bytes[bytes.offset++];
            }

            @Override
            public void readBytes(byte[] b, int offset, int len) throws IOException {
                while (len > bytes.length) {
                    System.arraycopy(bytes.bytes, bytes.offset, b, offset, bytes.length);
                    len -= bytes.length;
                    offset += bytes.length;
                    fillBuffer();
                }
                System.arraycopy(bytes.bytes, bytes.offset, b, offset, len);
                bytes.offset += len;
                bytes.length -= len;
            }

        };
    } else {
        final BytesRef bytes = totalLength <= BUFFER_REUSE_THRESHOLD ? this.bytes : new BytesRef();
        decompressor.decompress(fieldsStream, totalLength, offset, length, bytes);
        assert bytes.length == length;
        documentInput = new ByteArrayDataInput(bytes.bytes, bytes.offset, bytes.length);
    }

    for (int fieldIDX = 0; fieldIDX < numStoredFields; fieldIDX++) {
        final long infoAndBits = documentInput.readVLong();
        final int fieldNumber = (int) (infoAndBits >>> TYPE_BITS);
        final FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldNumber);

        final int bits = (int) (infoAndBits & TYPE_MASK);
        assert bits <= NUMERIC_DOUBLE : "bits=" + Integer.toHexString(bits);

        // read the field's visibility restriction, if any
        FieldVisibility cv = RestrictedStoredFieldVisitor.EMPTY;
        boolean isRestricted = documentInput.readByte() == 1;
        if (isRestricted) {
            int cv_length = documentInput.readVInt();
            byte[] cv_bytes = new byte[cv_length];
            documentInput.readBytes(cv_bytes, 0, cv_length);
            cv = new FieldVisibility(cv_bytes);
        }

        RestrictedStoredFieldVisitor restrictedStoredFieldVisitor = DelegatingRestrictedFieldVisitor.wrap(visitor);
        if (evaluate(cv)) {
            switch (restrictedStoredFieldVisitor.needsField(fieldInfo, cv)) {
            case YES:
                readField(documentInput, restrictedStoredFieldVisitor, fieldInfo, bits, cv);
                break;
            case NO:
                skipField(documentInput, bits, cv);
                break;
            case STOP:
                return;
            }
        } else {
            skipField(documentInput, bits, cv);
        }
    }
}
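On the small-document path the decompressed window is wrapped in a ByteArrayDataInput without copying; documents above BUFFER_REUSE_THRESHOLD get a throwaway BytesRef so the reader-level scratch never balloons. The same no-copy wrapping works for any BytesRef you want to read as a stream; a one-line sketch:

// expose a BytesRef's valid window as a DataInput; offset and length matter, not bytes.bytes.length
DataInput docIn = new ByteArrayDataInput(bytes.bytes, bytes.offset, bytes.length);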
From source file:com.rocana.lucene.codec.v1.RocanaBlockTreeTermsReader.java
License:Apache License
private static BytesRef readBytesRef(IndexInput in) throws IOException {
    BytesRef bytes = new BytesRef();
    bytes.length = in.readVInt();
    bytes.bytes = new byte[bytes.length];
    in.readBytes(bytes.bytes, 0, bytes.length);
    return bytes;
}
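The method fills a fresh BytesRef field by field: set the length, allocate the backing array, read into it. On Lucene 5 and later the same job is usually done with BytesRefBuilder, which manages buffer growth; a sketch under that assumption:

private static BytesRef readBytesRef(IndexInput in) throws IOException {
    int len = in.readVInt();
    BytesRefBuilder builder = new BytesRefBuilder();
    builder.grow(len);      // ensure capacity for len bytes
    builder.setLength(len); // mark the region we are about to fill as valid
    in.readBytes(builder.bytes(), 0, len);
    return builder.get();
}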
From source file:com.rondhuit.w2v.lucene.LuceneIndexCorpus.java
License:Apache License
@Override
public void learnVocab() throws IOException {
    super.learnVocab();

    final String field = ((LuceneIndexConfig) config).getField();
    final Terms terms = MultiFields.getTerms(reader, field);
    final BytesRef maxTerm = terms.getMax();
    final BytesRef minTerm = terms.getMin();
    Query q = new TermRangeQuery(field, minTerm, maxTerm, true, true);
    IndexSearcher searcher = new IndexSearcher(reader);
    topDocs = searcher.search(q, Integer.MAX_VALUE);

    TermsEnum termsEnum = null;
    termsEnum = terms.iterator(termsEnum);
    termsEnum.seekCeil(new BytesRef());
    BytesRef term = termsEnum.term();
    while (term != null) {
        int p = addWordToVocab(term.utf8ToString());
        vocab[p].setCn((int) termsEnum.totalTermFreq());
        term = termsEnum.next();
    }
}
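termsEnum.seekCeil(new BytesRef()) positions the enum at the smallest term, since every term sorts at or after the empty byte string; term() and next() then walk the whole field. The more common shape folds the seek away entirely; a sketch of the equivalent loop:

TermsEnum termsEnum = terms.iterator(null);
BytesRef term;
while ((term = termsEnum.next()) != null) { // next() starts at the first term on a fresh enum
    int p = addWordToVocab(term.utf8ToString());
    vocab[p].setCn((int) termsEnum.totalTermFreq());
}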
From source file:com.shaie.PhraseVsSpanQuery.java
License:Apache License
@SuppressWarnings("resource") public static void main(String[] args) throws Exception { final Directory dir = new RAMDirectory(); final IndexWriterConfig conf = new IndexWriterConfig(new WhitespaceAnalyzer()); final IndexWriter writer = new IndexWriter(dir, conf); final Document doc = new Document(); doc.add(new TextField("f", new TokenStream() { final PositionIncrementAttribute pos = addAttribute(PositionIncrementAttribute.class); final CharTermAttribute term = addAttribute(CharTermAttribute.class); boolean first = true, done = false; @Override// ww w .j av a2 s . co m public boolean incrementToken() throws IOException { if (done) { return false; } if (first) { term.setEmpty().append("a"); pos.setPositionIncrement(1); first = false; } else { term.setEmpty().append("b"); pos.setPositionIncrement(0); done = true; } return true; } })); writer.addDocument(doc); writer.close(); final DirectoryReader reader = DirectoryReader.open(dir); final IndexSearcher searcher = new IndexSearcher(reader); final LeafReader ar = reader.leaves().get(0).reader(); final TermsEnum te = ar.terms("f").iterator(); BytesRef scratch = new BytesRef(); while ((scratch = te.next()) != null) { System.out.println(scratch.utf8ToString()); final PostingsEnum dape = ar.postings(new Term("f", scratch.utf8ToString())); System.out.println(" doc=" + dape.nextDoc() + ", pos=" + dape.nextPosition()); } System.out.println(); // try a phrase query with a slop final PhraseQuery pqNoSlop = buildPhraseQuery(0); System.out.println("searching for \"a b\"; num results = " + searcher.search(pqNoSlop, 10).totalHits); final PhraseQuery pqSlop1 = buildPhraseQuery(1); System.out.println("searching for \"a b\"~1; num results = " + searcher.search(pqSlop1, 10).totalHits); final PhraseQuery pqSlop3 = buildPhraseQuery(3); System.out.println("searching for \"a b\"~3; num results = " + searcher.search(pqSlop3, 10).totalHits); final SpanNearQuery snqUnOrdered = new SpanNearQuery( new SpanQuery[] { new SpanTermQuery(new Term("f", "a")), new SpanTermQuery(new Term("f", "b")) }, 1, false); System.out.println("searching for SpanNearUnordered('a', 'b'), slop=1; num results = " + searcher.search(snqUnOrdered, 10).totalHits); final SpanNearQuery snqOrdered = new SpanNearQuery( new SpanQuery[] { new SpanTermQuery(new Term("f", "a")), new SpanTermQuery(new Term("f", "b")) }, 1, true); System.out.println("searching for SpanNearOrdered('a', 'b'), slop=1; num results = " + searcher.search(snqOrdered, 10).totalHits); reader.close(); }
From source file:com.sindicetech.siren.search.node.NodeConstantScoreAutoRewrite.java
License:Open Source License
@Override
public Query rewrite(final IndexReader reader, final MultiNodeTermQuery query) throws IOException {
    // Disabled cutoffs
    final int docCountCutoff = Integer.MAX_VALUE;
    final int termCountLimit = Integer.MAX_VALUE;

    final CutOffTermCollector col = new CutOffTermCollector(docCountCutoff, termCountLimit);
    this.collectTerms(reader, query, col);
    final int size = col.pendingTerms.size();
    if (col.hasCutOff) {
        return MultiNodeTermQuery.CONSTANT_SCORE_FILTER_REWRITE.rewrite(reader, query);
    } else if (size == 0) {
        return this.getTopLevelQuery(query);
    } else {
        final NodeBooleanQuery bq = this.getTopLevelQuery(query);
        final BytesRefHash pendingTerms = col.pendingTerms;
        final int sort[] = pendingTerms.sort(col.termsEnum.getComparator());
        for (int i = 0; i < size; i++) {
            final int pos = sort[i];
            // docFreq is not used for constant score here, we pass 1
            // to explicitly set a fake value, so it's not calculated
            this.addClause(bq, new Term(query.field, pendingTerms.get(pos, new BytesRef())), 1, 1.0f,
                    col.array.termState[pos]);
        }
        // Strip scores
        final NodeQuery result = new NodeConstantScoreQuery(bq);
        result.setBoost(query.getBoost());
        return result;
    }
}
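pendingTerms.get(pos, ref) does not copy bytes: it repoints ref at the hash's internal byte pool and returns it, which is why each clause wraps its term in a fresh BytesRef rather than sharing one scratch. A small self-contained illustration of the aliasing, with hypothetical contents:

BytesRefHash hash = new BytesRefHash();
int idA = hash.add(new BytesRef("alpha")); // first new entry gets id 0
int idB = hash.add(new BytesRef("beta"));  // second gets id 1
BytesRef scratch = new BytesRef();
BytesRef first = hash.get(idA, scratch);   // returns scratch, pointed at "alpha"
BytesRef second = hash.get(idB, scratch);  // same object, repointed at "beta"
System.out.println(first == second);       // true: one shared object
System.out.println(first.utf8ToString());  // "beta", so reusing one scratch would corrupt earlier Terms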
From source file:com.sindicetech.siren.search.node.NodeScoringRewrite.java
License:Open Source License
@Override
public Q rewrite(final IndexReader reader, final MultiNodeTermQuery query) throws IOException {
    final Q result = this.getTopLevelQuery(query);
    final ParallelArraysTermCollector col = new ParallelArraysTermCollector();
    this.collectTerms(reader, query, col);

    final int size = col.terms.size();
    if (size > 0) {
        final int sort[] = col.terms.sort(col.termsEnum.getComparator());
        final float[] boost = col.array.boost;
        final TermContext[] termStates = col.array.termState;
        for (int i = 0; i < size; i++) {
            final int pos = sort[i];
            final Term term = new Term(query.getField(), col.terms.get(pos, new BytesRef()));
            assert reader.docFreq(term) == termStates[pos].docFreq();
            this.addClause(result, term, termStates[pos].docFreq(), query.getBoost() * boost[pos],
                    termStates[pos]);
        }
    }
    return result;
}