List of usage examples for org.apache.lucene.codecs CodecUtil checkIndexHeader
public static int checkIndexHeader(DataInput in, String codec, int minVersion, int maxVersion, byte[] expectedID, String expectedSuffix) throws IOException
From source file:com.rocana.lucene.codec.v1.RocanaBlockTreeTermsReader.java
License:Apache License
/** Sole constructor. */ public RocanaBlockTreeTermsReader(PostingsReaderBase postingsReader, SegmentReadState state) throws IOException { boolean success = false; IndexInput indexIn = null;/*from w w w.j a va 2 s. c o m*/ this.postingsReader = postingsReader; this.segment = state.segmentInfo.name; String termsName = IndexFileNames.segmentFileName(segment, state.segmentSuffix, TERMS_EXTENSION); try { termsIn = state.directory.openInput(termsName, state.context); version = CodecUtil.checkIndexHeader(termsIn, TERMS_CODEC_NAME, VERSION_START, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); if (version < VERSION_AUTO_PREFIX_TERMS) { // Old (pre-5.2.0) index, no auto-prefix terms: this.anyAutoPrefixTerms = false; } else if (version == VERSION_AUTO_PREFIX_TERMS) { // 5.2.x index, might have auto-prefix terms: this.anyAutoPrefixTerms = true; } else { // 5.3.x index, we record up front if we may have written any auto-prefix terms: assert version >= VERSION_AUTO_PREFIX_TERMS_COND; byte b = termsIn.readByte(); if (b == 0) { this.anyAutoPrefixTerms = false; } else if (b == 1) { this.anyAutoPrefixTerms = true; } else { throw new CorruptIndexException("invalid anyAutoPrefixTerms: expected 0 or 1 but got " + b, termsIn); } } String indexName = IndexFileNames.segmentFileName(segment, state.segmentSuffix, TERMS_INDEX_EXTENSION); indexIn = state.directory.openInput(indexName, state.context); CodecUtil.checkIndexHeader(indexIn, TERMS_INDEX_CODEC_NAME, version, version, state.segmentInfo.getId(), state.segmentSuffix); // IMPORTANT: comment out this one line to prevent checksumming the entire file. // This is the reason we have a custom Lucene codec and forked Lucene classes. //CodecUtil.checksumEntireFile(indexIn); // Have PostingsReader init itself postingsReader.init(termsIn, state); // NOTE: data file is too costly to verify checksum against all the bytes on open, // but for now we at least verify proper structure of the checksum footer: which looks // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption // such as file truncation. CodecUtil.retrieveChecksum(termsIn); // Read per-field details seekDir(termsIn, dirOffset); seekDir(indexIn, indexDirOffset); final int numFields = termsIn.readVInt(); if (numFields < 0) { throw new CorruptIndexException("invalid numFields: " + numFields, termsIn); } for (int i = 0; i < numFields; ++i) { final int field = termsIn.readVInt(); final long numTerms = termsIn.readVLong(); if (numTerms <= 0) { throw new CorruptIndexException("Illegal numTerms for field number: " + field, termsIn); } final int numBytes = termsIn.readVInt(); if (numBytes < 0) { throw new CorruptIndexException( "invalid rootCode for field number: " + field + ", numBytes=" + numBytes, termsIn); } final BytesRef rootCode = new BytesRef(new byte[numBytes]); termsIn.readBytes(rootCode.bytes, 0, numBytes); rootCode.length = numBytes; final FieldInfo fieldInfo = state.fieldInfos.fieldInfo(field); if (fieldInfo == null) { throw new CorruptIndexException("invalid field number: " + field, termsIn); } final long sumTotalTermFreq = fieldInfo.getIndexOptions() == IndexOptions.DOCS ? -1 : termsIn.readVLong(); final long sumDocFreq = termsIn.readVLong(); final int docCount = termsIn.readVInt(); final int longsSize = termsIn.readVInt(); if (longsSize < 0) { throw new CorruptIndexException( "invalid longsSize for field: " + fieldInfo.name + ", longsSize=" + longsSize, termsIn); } BytesRef minTerm = readBytesRef(termsIn); BytesRef maxTerm = readBytesRef(termsIn); if (docCount < 0 || docCount > state.segmentInfo.maxDoc()) { // #docs with field must be <= #docs throw new CorruptIndexException( "invalid docCount: " + docCount + " maxDoc: " + state.segmentInfo.maxDoc(), termsIn); } if (sumDocFreq < docCount) { // #postings must be >= #docs with field throw new CorruptIndexException("invalid sumDocFreq: " + sumDocFreq + " docCount: " + docCount, termsIn); } if (sumTotalTermFreq != -1 && sumTotalTermFreq < sumDocFreq) { // #positions must be >= #postings throw new CorruptIndexException( "invalid sumTotalTermFreq: " + sumTotalTermFreq + " sumDocFreq: " + sumDocFreq, termsIn); } final long indexStartFP = indexIn.readVLong(); RocanaFieldReader previous = fields.put(fieldInfo.name, new RocanaFieldReader(this, fieldInfo, numTerms, rootCode, sumTotalTermFreq, sumDocFreq, docCount, indexStartFP, longsSize, indexIn, minTerm, maxTerm)); if (previous != null) { throw new CorruptIndexException("duplicate field: " + fieldInfo.name, termsIn); } } indexIn.close(); success = true; } finally { if (!success) { // this.close() will close in: IOUtils.closeWhileHandlingException(indexIn, this); } } }
From source file:com.vmware.xenon.services.common.Lucene60FieldInfosFormatWithCache.java
License:Open Source License
@Override public FieldInfos read(Directory directory, SegmentInfo segmentInfo, String segmentSuffix, IOContext context) throws IOException { ////////////////////// boolean checkInfosCache = true; ////////////////////// final String fileName = IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, EXTENSION); try (ChecksumIndexInput input = directory.openChecksumInput(fileName, context)) { Throwable priorE = null;/*from ww w . j a v a 2 s. co m*/ FieldInfo[] infos = null; try { CodecUtil.checkIndexHeader(input, Lucene60FieldInfosFormatWithCache.CODEC_NAME, Lucene60FieldInfosFormatWithCache.FORMAT_START, Lucene60FieldInfosFormatWithCache.FORMAT_CURRENT, segmentInfo.getId(), segmentSuffix); final int size = input.readVInt(); //read in the size infos = new FieldInfo[size]; // previous field's attribute map, we share when possible: Map<String, String> lastAttributes = Collections.emptyMap(); for (int i = 0; i < size; i++) { String name = input.readString(); final int fieldNumber = input.readVInt(); if (fieldNumber < 0) { throw new CorruptIndexException( "invalid field number for field: " + name + ", fieldNumber=" + fieldNumber, input); } byte bits = input.readByte(); boolean storeTermVector = (bits & STORE_TERMVECTOR) != 0; boolean omitNorms = (bits & OMIT_NORMS) != 0; boolean storePayloads = (bits & STORE_PAYLOADS) != 0; final IndexOptions indexOptions = getIndexOptions(input, input.readByte()); // DV Types are packed in one byte final DocValuesType docValuesType = getDocValuesType(input, input.readByte()); final long dvGen = input.readLong(); Map<String, String> attributes = input.readMapOfStrings(); // just use the last field's map if its the same if (attributes.equals(lastAttributes)) { attributes = lastAttributes; } lastAttributes = attributes; int pointDimensionCount = input.readVInt(); int pointNumBytes; if (pointDimensionCount != 0) { pointNumBytes = input.readVInt(); } else { pointNumBytes = 0; } try { ////////////////////// if (dvGen >= 0) { // skip fields with docValues, they don't cache well checkInfosCache = false; infos[i] = new FieldInfo(name, fieldNumber, storeTermVector, omitNorms, storePayloads, indexOptions, docValuesType, dvGen, attributes, pointDimensionCount, pointNumBytes); } else { infos[i] = this.cache.dedupFieldInfo(name, fieldNumber, storeTermVector, omitNorms, storePayloads, indexOptions, docValuesType, dvGen, attributes, pointDimensionCount, pointNumBytes); } ////////////////////// } catch (IllegalStateException e) { throw new CorruptIndexException( "invalid fieldinfo for field: " + name + ", fieldNumber=" + fieldNumber, input, e); } } } catch (Throwable exception) { priorE = exception; } finally { CodecUtil.checkFooter(input, priorE); } ////////////////////// if (checkInfosCache) { return this.cache.dedupFieldInfos(infos); } else { FieldInfos result = new FieldInfos(infos); this.cache.trimFieldInfos(result); return result; } ////////////////////// } }
From source file:perf.DiskUsage.java
License:Apache License
static Set<FieldStats> analyzeFields(SegmentReader reader) throws Exception { Map<String, FieldStats> stats = new HashMap<>(); Map<String, String> dvSuffixes = new HashMap<>(); Map<String, String> postingsSuffixes = new HashMap<>(); for (FieldInfo field : reader.getFieldInfos()) { FieldStats fieldStats = new FieldStats(field.name); stats.put(field.name, fieldStats); Map<String, String> attributes = field.attributes(); if (attributes != null) { String postingsSuffix = attributes.get(PerFieldPostingsFormat.PER_FIELD_SUFFIX_KEY); if (postingsSuffix != null) { postingsSuffixes.put(postingsSuffix, field.name); }// w ww . j a v a 2 s. c om String dvSuffix = attributes.get(PerFieldDocValuesFormat.PER_FIELD_SUFFIX_KEY); if (dvSuffix != null) { dvSuffixes.put(dvSuffix, field.name); } } Bits docsWithField = reader.getDocsWithField(field.name); if (docsWithField != null) { int count = 0; for (int docID = 0; docID < reader.maxDoc(); docID++) { if (docsWithField.get(docID)) { count++; } } fieldStats.docCountWithField = count; } } Directory directory = reader.directory(); for (String file : directory.listAll()) { String suffix = parseSuffix(file); long bytes = directory.fileLength(file); if (suffix != null) { switch (IndexFileNames.getExtension(file)) { case "dvd": case "dvm": stats.get(dvSuffixes.get(suffix)).dvBytes += bytes; break; case "tim": case "tip": stats.get(postingsSuffixes.get(suffix)).termsBytes += bytes; break; case "doc": stats.get(postingsSuffixes.get(suffix)).postingsBytes += bytes; break; case "pos": case "pay": stats.get(postingsSuffixes.get(suffix)).proxBytes += bytes; break; default: throw new AssertionError("unexpected suffixed file: " + file); } } else { // not a per-field file, but we can hackishly do this for the points case. if ("dii".equals(IndexFileNames.getExtension(file))) { System.err.println( "retrieving per-field point usage, if you see a scary corruption error, its probably just this tool!!!!"); try (ChecksumIndexInput in = directory.openChecksumInput(file, IOContext.READONCE)) { // fail hard if its not exactly the version we do this hack for. CodecUtil.checkIndexHeader(in, "Lucene60PointsFormatMeta", 0, 0, reader.getSegmentInfo().info.getId(), ""); int fieldCount = in.readVInt(); // strangely, bkd offsets are not in any guaranteed order TreeMap<Long, String> offsetToField = new TreeMap<>(); for (int i = 0; i < fieldCount; i++) { int field = in.readVInt(); long offset = in.readVLong(); offsetToField.put(offset, reader.getFieldInfos().fieldInfo(field).name); } // now we can traverse in order long previousOffset = 0; for (Map.Entry<Long, String> entry : offsetToField.entrySet()) { long offset = entry.getKey(); String field = entry.getValue(); stats.get(field).pointsBytes += (offset - previousOffset); previousOffset = offset; } CodecUtil.checkFooter(in); } } } } return new TreeSet<FieldStats>(stats.values()); }
From source file:perf.DiskUsage.java
License:Apache License
static Set<FieldStats> analyzeFields(SegmentReader reader) throws Exception { Map<String, FieldStats> stats = new HashMap<>(); Map<String, String> dvSuffixes = new HashMap<>(); Map<String, String> postingsSuffixes = new HashMap<>(); for (FieldInfo field : reader.getFieldInfos()) { FieldStats fieldStats = new FieldStats(field.name); stats.put(field.name, fieldStats); Map<String, String> attributes = field.attributes(); if (attributes != null) { String postingsSuffix = attributes.get(PerFieldPostingsFormat.PER_FIELD_SUFFIX_KEY); if (postingsSuffix != null) { postingsSuffixes.put(postingsSuffix, field.name); }/*from w ww . ja v a 2 s . c o m*/ String dvSuffix = attributes.get(PerFieldDocValuesFormat.PER_FIELD_SUFFIX_KEY); if (dvSuffix != null) { dvSuffixes.put(dvSuffix, field.name); } } DocIdSetIterator docsWithField; switch (field.getDocValuesType()) { case NUMERIC: docsWithField = reader.getNumericDocValues(field.name); break; case BINARY: docsWithField = reader.getBinaryDocValues(field.name); break; case SORTED: docsWithField = reader.getSortedDocValues(field.name); break; case SORTED_NUMERIC: docsWithField = reader.getSortedNumericDocValues(field.name); break; case SORTED_SET: docsWithField = reader.getSortedSetDocValues(field.name); break; case NONE: docsWithField = null; break; default: docsWithField = null; break; } if (docsWithField != null) { int count = 0; while (docsWithField.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { count++; } fieldStats.docCountWithField = count; } } Directory directory = reader.directory(); for (String file : directory.listAll()) { String suffix = parseSuffix(file); long bytes = directory.fileLength(file); if (suffix != null) { switch (IndexFileNames.getExtension(file)) { case "dvd": case "dvm": stats.get(dvSuffixes.get(suffix)).dvBytes += bytes; break; case "tim": case "tip": stats.get(postingsSuffixes.get(suffix)).termsBytes += bytes; break; case "doc": stats.get(postingsSuffixes.get(suffix)).postingsBytes += bytes; break; case "pos": case "pay": stats.get(postingsSuffixes.get(suffix)).proxBytes += bytes; break; default: throw new AssertionError("unexpected suffixed file: " + file); } } else { // not a per-field file, but we can hackishly do this for the points case. if ("dii".equals(IndexFileNames.getExtension(file))) { System.err.println( "retrieving per-field point usage, if you see a scary corruption error, its probably just this tool!!!!"); try (ChecksumIndexInput in = directory.openChecksumInput(file, IOContext.READONCE)) { // fail hard if its not exactly the version we do this hack for. CodecUtil.checkIndexHeader(in, "Lucene60PointsFormatMeta", 0, 0, reader.getSegmentInfo().info.getId(), ""); int fieldCount = in.readVInt(); // strangely, bkd offsets are not in any guaranteed order TreeMap<Long, String> offsetToField = new TreeMap<>(); for (int i = 0; i < fieldCount; i++) { int field = in.readVInt(); long offset = in.readVLong(); offsetToField.put(offset, reader.getFieldInfos().fieldInfo(field).name); } // now we can traverse in order long previousOffset = 0; for (Map.Entry<Long, String> entry : offsetToField.entrySet()) { long offset = entry.getKey(); String field = entry.getValue(); stats.get(field).pointsBytes += (offset - previousOffset); previousOffset = offset; } CodecUtil.checkFooter(in); } } } } return new TreeSet<FieldStats>(stats.values()); }