List of usage examples for org.apache.lucene.util BytesRef bytesEquals
public boolean bytesEquals(BytesRef other)
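bytesEquals compares only the valid byte window of this BytesRef (bytes[offset] through bytes[offset + length - 1]) with the valid window of other; the backing arrays, offsets, and capacities themselves are irrelevant. A minimal self-contained sketch (class and variable names here are illustrative, not taken from the projects below):

import java.nio.charset.StandardCharsets;

import org.apache.lucene.util.BytesRef;

public class BytesEqualsExample {
    public static void main(String[] args) {
        // Same six bytes, but stored in different arrays at different offsets.
        BytesRef a = new BytesRef("lucene");
        byte[] backing = "xxlucenexx".getBytes(StandardCharsets.UTF_8);
        BytesRef b = new BytesRef(backing, 2, 6); // the "lucene" slice

        System.out.println(a.bytesEquals(b));                      // true: only the valid ranges are compared
        System.out.println(a.bytesEquals(new BytesRef("Lucene"))); // false: raw, case-sensitive byte comparison
    }
}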
From source file:de.unihildesheim.iw.lucene.util.BytesRefUtilsTest.java
License:Open Source License
@Test
public void testCopyBytes() throws Exception {
    final BytesRef br = new BytesRef("foo");
    final BytesRef result = new BytesRef(BytesRefUtils.copyBytes(br));
    Assert.assertTrue("Bytes mismatch.", br.bytesEquals(result));
}
From source file:de.unihildesheim.iw.lucene.util.BytesRefUtilsTest.java
License:Open Source License
@Test
public void testCopyBytes_empty() throws Exception {
    final BytesRef br = new BytesRef();
    final BytesRef result = new BytesRef(BytesRefUtils.copyBytes(br));
    Assert.assertTrue("Bytes mismatch.", br.bytesEquals(result));
}
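Both tests above go through BytesRefUtils.copyBytes, a helper from the project under test whose implementation is not shown here. Presumably it copies just the valid byte window so that wrapping the returned array in a fresh BytesRef makes br.bytesEquals(result) hold; a hypothetical sketch of such a helper (an assumption, not the actual de.unihildesheim code):

import java.util.Arrays;

import org.apache.lucene.util.BytesRef;

final class BytesRefUtilsSketch {
    private BytesRefUtilsSketch() {
    }

    // Hypothetical stand-in for BytesRefUtils.copyBytes: copy only the valid
    // window [offset, offset + length) so that new BytesRef(copyBytes(br))
    // is bytesEquals to br.
    static byte[] copyBytes(BytesRef br) {
        return Arrays.copyOfRange(br.bytes, br.offset, br.offset + br.length);
    }
}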
From source file:org.codelibs.elasticsearch.common.util.BytesRefHash.java
License:Apache License
/**
 * Get the id associated with <code>key</code>
 */
public long find(BytesRef key, int code) {
    final long slot = slot(rehash(code), mask);
    for (long index = slot;; index = nextSlot(index, mask)) {
        final long id = id(index);
        if (id == -1L || key.bytesEquals(get(id, spare))) {
            return id;
        }
    }
}
From source file:org.codelibs.elasticsearch.common.util.BytesRefHash.java
License:Apache License
private long set(BytesRef key, int code, long id) {
    assert rehash(key.hashCode()) == code;
    assert size < maxSize;
    final long slot = slot(code, mask);
    for (long index = slot;; index = nextSlot(index, mask)) {
        final long curId = id(index);
        if (curId == -1) { // means unset
            id(index, id);
            append(id, key, code);
            ++size;
            return id;
        } else if (key.bytesEquals(get(curId, spare))) {
            return -1 - curId;
        }
    }
}
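find and set above both use open addressing: probe successive slots until either an unset slot is hit (the key is absent) or the stored key bytesEquals the lookup key (a match); any other occupied slot is a hash collision and probing continues. A stripped-down illustration of that lookup loop (toy fixed-size table with hypothetical names, not the Elasticsearch implementation):

import org.apache.lucene.util.BytesRef;

final class ToyBytesTable {
    // Toy open-addressed table; a null slot means "unset".
    private final BytesRef[] slots = new BytesRef[8];

    /** Returns the slot index holding key, or -1 if the key is not stored. */
    int find(BytesRef key) {
        int start = Math.floorMod(key.hashCode(), slots.length);
        for (int probe = 0; probe < slots.length; probe++) {
            int index = (start + probe) % slots.length;
            BytesRef stored = slots[index];
            if (stored == null) {
                return -1;        // unset slot: key is absent
            } else if (key.bytesEquals(stored)) {
                return index;     // bytes match: found it
            }
            // otherwise: hash collision, keep probing the next slot
        }
        return -1;                // table full of other keys
    }
}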
From source file:org.elasticsearch.common.lucene.search.XTermsFilter.java
License:Apache License
private XTermsFilter(FieldAndTermEnum iter, int length) {
    // TODO: maybe use oal.index.PrefixCodedTerms instead?
    // If number of terms is more than a few hundred it should be a win
    // TODO: we also pack terms in FieldCache/DocValues
    // ... maybe we can refactor to share that code
    // TODO: yet another option is to build the union of the terms in
    // an automaton an call intersect on the termsenum if the density is high
    int hash = 9;
    byte[] serializedTerms = new byte[0];
    this.offsets = new int[length + 1];
    int lastEndOffset = 0;
    int index = 0;
    ArrayList<TermsAndField> termsAndFields = new ArrayList<TermsAndField>();
    TermsAndField lastTermsAndField = null;
    BytesRef previousTerm = null;
    String previousField = null;
    BytesRef currentTerm;
    String currentField;
    while ((currentTerm = iter.next()) != null) {
        currentField = iter.field();
        if (currentField == null) {
            throw new IllegalArgumentException("Field must not be null");
        }
        if (previousField != null) {
            // deduplicate
            if (previousField.equals(currentField)) {
                if (previousTerm.bytesEquals(currentTerm)) {
                    continue;
                }
            } else {
                final int start = lastTermsAndField == null ? 0 : lastTermsAndField.end;
                lastTermsAndField = new TermsAndField(start, index, previousField);
                termsAndFields.add(lastTermsAndField);
            }
        }
        hash = PRIME * hash + currentField.hashCode();
        hash = PRIME * hash + currentTerm.hashCode();
        if (serializedTerms.length < lastEndOffset + currentTerm.length) {
            serializedTerms = ArrayUtil.grow(serializedTerms, lastEndOffset + currentTerm.length);
        }
        System.arraycopy(currentTerm.bytes, currentTerm.offset, serializedTerms, lastEndOffset, currentTerm.length);
        offsets[index] = lastEndOffset;
        lastEndOffset += currentTerm.length;
        index++;
        previousTerm = currentTerm;
        previousField = currentField;
    }
    offsets[index] = lastEndOffset;
    final int start = lastTermsAndField == null ? 0 : lastTermsAndField.end;
    lastTermsAndField = new TermsAndField(start, index, previousField);
    termsAndFields.add(lastTermsAndField);
    this.termsBytes = ArrayUtil.shrink(serializedTerms, lastEndOffset);
    this.termsAndFields = termsAndFields.toArray(new TermsAndField[termsAndFields.size()]);
    this.hashCode = hash;
}
From source file:org.elasticsearch.index.translog.TranslogHeader.java
License:Apache License
/**
 * Read a translog header from the given path and file channel
 */
static TranslogHeader read(final String translogUUID, final Path path, final FileChannel channel) throws IOException {
    // This input is intentionally not closed because closing it will close the FileChannel.
    final BufferedChecksumStreamInput in = new BufferedChecksumStreamInput(
            new InputStreamStreamInput(java.nio.channels.Channels.newInputStream(channel), channel.size()),
            path.toString());
    final int version;
    try {
        version = CodecUtil.checkHeader(new InputStreamDataInput(in), TRANSLOG_CODEC, VERSION_CHECKSUMS, VERSION_PRIMARY_TERM);
    } catch (CorruptIndexException | IndexFormatTooOldException | IndexFormatTooNewException e) {
        tryReportOldVersionError(path, channel);
        throw new TranslogCorruptedException(path.toString(), "translog header corrupted", e);
    }
    if (version == VERSION_CHECKSUMS) {
        throw new IllegalStateException("pre-2.0 translog found [" + path + "]");
    }
    // Read the translogUUID
    final int uuidLen = in.readInt();
    if (uuidLen > channel.size()) {
        throw new TranslogCorruptedException(path.toString(), "UUID length can't be larger than the translog");
    }
    final BytesRef uuid = new BytesRef(uuidLen);
    uuid.length = uuidLen;
    in.read(uuid.bytes, uuid.offset, uuid.length);
    final BytesRef expectedUUID = new BytesRef(translogUUID);
    if (uuid.bytesEquals(expectedUUID) == false) {
        throw new TranslogCorruptedException(path.toString(),
                "expected shard UUID " + expectedUUID + " but got: " + uuid
                        + " this translog file belongs to a different translog");
    }
    // Read the primary term
    final long primaryTerm;
    if (version == VERSION_PRIMARY_TERM) {
        primaryTerm = in.readLong();
        assert primaryTerm >= 0 : "Primary term must be non-negative [" + primaryTerm + "]; translog path [" + path + "]";
    } else {
        assert version == VERSION_CHECKPOINTS : "Unknown header version [" + version + "]";
        primaryTerm = UNKNOWN_PRIMARY_TERM;
    }
    // Verify the checksum
    if (version >= VERSION_PRIMARY_TERM) {
        Translog.verifyChecksum(in);
    }
    final int headerSizeInBytes = headerSizeInBytes(version, uuid.length);
    assert channel.position() == headerSizeInBytes :
            "Header is not fully read; header size [" + headerSizeInBytes + "], position [" + channel.position() + "]";
    return new TranslogHeader(translogUUID, primaryTerm, headerSizeInBytes);
}
From source file:org.elasticsearch.index.translog.TranslogReader.java
License:Apache License
/**
 * Given a file, return a VersionedTranslogStream based on an
 * optionally-existing header in the file. If the file does not exist, or
 * has zero length, returns the latest version. If the header does not
 * exist, assumes Version 0 of the translog file format.
 *
 * @throws IOException
 */
public static ImmutableTranslogReader open(ChannelReference channelReference, Checkpoint checkpoint, String translogUUID) throws IOException {
    final FileChannel channel = channelReference.getChannel();
    final Path path = channelReference.getPath();
    assert channelReference.getGeneration() == checkpoint.generation :
            "expected generation: " + channelReference.getGeneration() + " but got: " + checkpoint.generation;
    try {
        if (checkpoint.offset == 0 && checkpoint.numOps == TranslogReader.UNKNOWN_OP_COUNT) { // only old files can be empty
            return new LegacyTranslogReader(channelReference.getGeneration(), channelReference, 0);
        }
        InputStreamStreamInput headerStream = new InputStreamStreamInput(Channels.newInputStream(channel)); // don't close
        // Lucene's CodecUtil writes a magic number of 0x3FD76C17 with the
        // header, in binary this looks like:
        //
        // binary: 0011 1111 1101 0111 0110 1100 0001 0111
        // hex   :    3    f    d    7    6    c    1    7
        //
        // With version 0 of the translog, the first byte is the
        // Operation.Type, which will always be between 0-4, so we know if
        // we grab the first byte, it can be:
        // 0x3f => Lucene's magic number, so we can assume it's version 1 or later
        // 0x00 => version 0 of the translog
        //
        // otherwise the first byte of the translog is corrupted and we
        // should bail
        byte b1 = headerStream.readByte();
        if (b1 == LUCENE_CODEC_HEADER_BYTE) {
            // Read 3 more bytes, meaning a whole integer has been read
            byte b2 = headerStream.readByte();
            byte b3 = headerStream.readByte();
            byte b4 = headerStream.readByte();
            // Convert the 4 bytes that were read into an integer
            int header = ((b1 & 0xFF) << 24) + ((b2 & 0xFF) << 16) + ((b3 & 0xFF) << 8) + ((b4 & 0xFF) << 0);
            // We confirm CodecUtil's CODEC_MAGIC number (0x3FD76C17)
            // ourselves here, because it allows us to read the first
            // byte separately
            if (header != CodecUtil.CODEC_MAGIC) {
                throw new TranslogCorruptedException("translog looks like version 1 or later, but has corrupted header");
            }
            // Confirm the rest of the header using CodecUtil, extracting
            // the translog version
            int version = CodecUtil.checkHeaderNoMagic(new InputStreamDataInput(headerStream), TranslogWriter.TRANSLOG_CODEC, 1, Integer.MAX_VALUE);
            switch (version) {
            case TranslogWriter.VERSION_CHECKSUMS:
                assert checkpoint.numOps == TranslogReader.UNKNOWN_OP_COUNT : "expected unknown op count but got: " + checkpoint.numOps;
                assert checkpoint.offset == Files.size(path) : "offset(" + checkpoint.offset + ") != file_size(" + Files.size(path) + ") for: " + path;
                // legacy - we still have to support it somehow
                return new LegacyTranslogReaderBase(channelReference.getGeneration(), channelReference,
                        CodecUtil.headerLength(TranslogWriter.TRANSLOG_CODEC), checkpoint.offset);
            case TranslogWriter.VERSION_CHECKPOINTS:
                assert path.getFileName().toString().endsWith(Translog.TRANSLOG_FILE_SUFFIX) : "new file ends with old suffix: " + path;
                assert checkpoint.numOps > TranslogReader.UNKNOWN_OP_COUNT : "expected at least 0 operatin but got: " + checkpoint.numOps;
                assert checkpoint.offset <= channel.size() : "checkpoint is inconsistent with channel length: " + channel.size() + " " + checkpoint;
                int len = headerStream.readInt();
                if (len > channel.size()) {
                    throw new TranslogCorruptedException("uuid length can't be larger than the translog");
                }
                BytesRef ref = new BytesRef(len);
                ref.length = len;
                headerStream.read(ref.bytes, ref.offset, ref.length);
                BytesRef uuidBytes = new BytesRef(translogUUID);
                if (uuidBytes.bytesEquals(ref) == false) {
                    throw new TranslogCorruptedException("expected shard UUID [" + uuidBytes + "] but got: [" + ref
                            + "] this translog file belongs to a different translog");
                }
                return new ImmutableTranslogReader(channelReference.getGeneration(), channelReference,
                        ref.length + CodecUtil.headerLength(TranslogWriter.TRANSLOG_CODEC) + RamUsageEstimator.NUM_BYTES_INT,
                        checkpoint.offset, checkpoint.numOps);
            default:
                throw new TranslogCorruptedException("No known translog stream version: " + version + " path:" + path);
            }
        } else if (b1 == UNVERSIONED_TRANSLOG_HEADER_BYTE) {
            assert checkpoint.numOps == TranslogReader.UNKNOWN_OP_COUNT : "expected unknown op count but got: " + checkpoint.numOps;
            assert checkpoint.offset == Files.size(path) : "offset(" + checkpoint.offset + ") != file_size(" + Files.size(path) + ") for: " + path;
            return new LegacyTranslogReader(channelReference.getGeneration(), channelReference, checkpoint.offset);
        } else {
            throw new TranslogCorruptedException("Invalid first byte in translog file, got: " + Long.toHexString(b1) + ", expected 0x00 or 0x3f");
        }
    } catch (CorruptIndexException | IndexFormatTooOldException | IndexFormatTooNewException e) {
        throw new TranslogCorruptedException("Translog header corrupted", e);
    }
}
From source file:org.elasticsearch.search.aggregations.support.FieldDataSourceTests.java
License:Apache License
private static void assertConsistent(BytesValues values) {
    for (int i = 0; i < 10; ++i) {
        final int valueCount = values.setDocument(i);
        for (int j = 0; j < valueCount; ++j) {
            final BytesRef term = values.nextValue();
            assertEquals(term.hashCode(), values.currentValueHash());
            assertTrue(term.bytesEquals(values.copyShared()));
        }
    }
}
From source file:org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.java
License:Apache License
protected void postFilter(final Candidate candidate, final CharsRef spare, BytesRef byteSpare,
        final List<Candidate> candidates) throws IOException {
    if (postFilter == null) {
        candidates.add(candidate);
    } else {
        final BytesRef result = byteSpare;
        SuggestUtils.analyze(postFilter, candidate.term, field, new SuggestUtils.TokenConsumer() {
            @Override
            public void nextToken() throws IOException {
                this.fillBytesRef(result);
                if (posIncAttr.getPositionIncrement() > 0 && result.bytesEquals(candidate.term)) {
                    BytesRef term = BytesRef.deepCopyOf(result);
                    long freq = frequency(term);
                    candidates.add(new Candidate(BytesRef.deepCopyOf(term), freq, candidate.stringDistance,
                            score(candidate.frequency, candidate.stringDistance, dictSize), false));
                } else {
                    candidates.add(new Candidate(BytesRef.deepCopyOf(result), candidate.frequency, nonErrorLikelihood,
                            score(candidate.frequency, candidate.stringDistance, dictSize), false));
                }
            }
        }, spare);
    }
}
From source file:org.nlp4l.lucene.BuddyWordsFinder.java
License:Apache License
public Scorer[] findCoincidentalTerms(String field, BytesRef term) throws IOException {
    baseTermFilter.start(reader, field, term);
    if (baseTermFilter.skip(term) || baseTermFilter.skipByPopularity(term))
        return null;
    //System.out.println(term.utf8ToString());
    Bits liveDocs = MultiFields.getLiveDocs(reader);
    PostingsEnum de = MultiFields.getTermDocsEnum(reader, field, term);
    if (de == null)
        return null;
    int numDocsAnalyzed = 0;
    phraseTerms.clear();
    while (de.nextDoc() != PostingsEnum.NO_MORE_DOCS && numDocsAnalyzed < maxDocsToAnalyze) {
        int docId = de.docID();
        // first record all of the positions of the term in a bitset which
        // represents terms in the current doc.
        int freq = de.freq();
        PostingsEnum pe = MultiFields.getTermPositionsEnum(reader, field, term);
        int ret = pe.advance(docId);
        if (ret == PostingsEnum.NO_MORE_DOCS)
            continue;
        termPos.clear();
        for (int i = 0; i < freq; i++) {
            int pos = pe.nextPosition();
            if (pos < termPos.size())
                termPos.set(pos);
        }
        // now look at all OTHER terms in this doc and see if they are
        // positioned in a pre-defined sized window around the current term
        Fields vectors = reader.getTermVectors(docId);
        // check it has term vectors
        if (vectors == null)
            return null;
        Terms vector = vectors.terms(field);
        // check it has position info
        if (vector == null || !vector.hasPositions())
            return null;
        TermsEnum te = vector.iterator();
        BytesRef otherTerm = null;
        while ((otherTerm = te.next()) != null) {
            if (term.bytesEquals(otherTerm))
                continue;
            coiTermFilter.start(reader, field, otherTerm);
            if (coiTermFilter.skip(otherTerm) || coiTermFilter.skipByPopularity(otherTerm))
                continue;
            PostingsEnum pe2 = MultiFields.getTermPositionsEnum(reader, field, otherTerm);
            ret = pe2.advance(docId);
            if (ret == PostingsEnum.NO_MORE_DOCS)
                continue;
            freq = pe2.freq();
            boolean matchFound = false;
            for (int i = 0; i < freq && (!matchFound); i++) {
                int pos = pe2.nextPosition();
                int startpos = Math.max(0, pos - slop);
                int endpos = pos + slop;
                for (int prevpos = startpos; (prevpos <= endpos) && (!matchFound); prevpos++) {
                    if (termPos.get(prevpos)) {
                        // Add term to hashmap containing co-occurence
                        // counts for this term
                        Scorer pt = phraseTerms.get(otherTerm.utf8ToString());
                        if (pt == null) {
                            pt = new Scorer(baseTermFilter.getCurrentTermDocFreq(), otherTerm.utf8ToString(),
                                    coiTermFilter.getCurrentTermDocFreq());
                            phraseTerms.put(pt.coiTerm, pt);
                        }
                        pt.incCoiDocCount();
                        matchFound = true;
                    }
                }
            }
        }
        numDocsAnalyzed++;
    } // end of while loop
    // now sort and dump the top terms associated with this term.
    TopTerms topTerms = new TopTerms(maxCoiTermsPerTerm);
    for (String key : phraseTerms.keySet()) {
        Scorer pt = phraseTerms.get(key);
        topTerms.insertWithOverflow(pt);
    }
    Scorer[] tops = new Scorer[topTerms.size()];
    int tp = tops.length - 1;
    while (topTerms.size() > 0) {
        Scorer top = topTerms.pop();
        tops[tp--] = top;
    }
    return tops;
}