Example usage for org.apache.lucene.codecs CodecUtil checkIndexHeader

List of usage examples for org.apache.lucene.codecs CodecUtil checkIndexHeader

Introduction

In this page you can find the example usage for org.apache.lucene.codecs CodecUtil checkIndexHeader.

Prototype

public static int checkIndexHeader(DataInput in, String codec, int minVersion, int maxVersion,
        byte[] expectedID, String expectedSuffix) throws IOException 

Source Link

Document

Reads and validates a header previously written with #writeIndexHeader(DataOutput,String,int,byte[],String) .

Usage

From source file:com.rocana.lucene.codec.v1.RocanaBlockTreeTermsReader.java

License:Apache License

/** Sole constructor. */
public RocanaBlockTreeTermsReader(PostingsReaderBase postingsReader, SegmentReadState state)
        throws IOException {
    boolean success = false;
    IndexInput indexIn = null;/*from w w w.j  a  va  2 s.  c  o  m*/

    this.postingsReader = postingsReader;
    this.segment = state.segmentInfo.name;

    String termsName = IndexFileNames.segmentFileName(segment, state.segmentSuffix, TERMS_EXTENSION);
    try {
        termsIn = state.directory.openInput(termsName, state.context);
        version = CodecUtil.checkIndexHeader(termsIn, TERMS_CODEC_NAME, VERSION_START, VERSION_CURRENT,
                state.segmentInfo.getId(), state.segmentSuffix);

        if (version < VERSION_AUTO_PREFIX_TERMS) {
            // Old (pre-5.2.0) index, no auto-prefix terms:
            this.anyAutoPrefixTerms = false;
        } else if (version == VERSION_AUTO_PREFIX_TERMS) {
            // 5.2.x index, might have auto-prefix terms:
            this.anyAutoPrefixTerms = true;
        } else {
            // 5.3.x index, we record up front if we may have written any auto-prefix terms:
            assert version >= VERSION_AUTO_PREFIX_TERMS_COND;
            byte b = termsIn.readByte();
            if (b == 0) {
                this.anyAutoPrefixTerms = false;
            } else if (b == 1) {
                this.anyAutoPrefixTerms = true;
            } else {
                throw new CorruptIndexException("invalid anyAutoPrefixTerms: expected 0 or 1 but got " + b,
                        termsIn);
            }
        }

        String indexName = IndexFileNames.segmentFileName(segment, state.segmentSuffix, TERMS_INDEX_EXTENSION);
        indexIn = state.directory.openInput(indexName, state.context);
        CodecUtil.checkIndexHeader(indexIn, TERMS_INDEX_CODEC_NAME, version, version, state.segmentInfo.getId(),
                state.segmentSuffix);

        // IMPORTANT: comment out this one line to prevent checksumming the entire file.
        //            This is the reason we have a custom Lucene codec and forked Lucene classes.
        //CodecUtil.checksumEntireFile(indexIn);

        // Have PostingsReader init itself
        postingsReader.init(termsIn, state);

        // NOTE: data file is too costly to verify checksum against all the bytes on open,
        // but for now we at least verify proper structure of the checksum footer: which looks
        // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
        // such as file truncation.
        CodecUtil.retrieveChecksum(termsIn);

        // Read per-field details
        seekDir(termsIn, dirOffset);
        seekDir(indexIn, indexDirOffset);

        final int numFields = termsIn.readVInt();
        if (numFields < 0) {
            throw new CorruptIndexException("invalid numFields: " + numFields, termsIn);
        }

        for (int i = 0; i < numFields; ++i) {
            final int field = termsIn.readVInt();
            final long numTerms = termsIn.readVLong();
            if (numTerms <= 0) {
                throw new CorruptIndexException("Illegal numTerms for field number: " + field, termsIn);
            }
            final int numBytes = termsIn.readVInt();
            if (numBytes < 0) {
                throw new CorruptIndexException(
                        "invalid rootCode for field number: " + field + ", numBytes=" + numBytes, termsIn);
            }
            final BytesRef rootCode = new BytesRef(new byte[numBytes]);
            termsIn.readBytes(rootCode.bytes, 0, numBytes);
            rootCode.length = numBytes;
            final FieldInfo fieldInfo = state.fieldInfos.fieldInfo(field);
            if (fieldInfo == null) {
                throw new CorruptIndexException("invalid field number: " + field, termsIn);
            }
            final long sumTotalTermFreq = fieldInfo.getIndexOptions() == IndexOptions.DOCS ? -1
                    : termsIn.readVLong();
            final long sumDocFreq = termsIn.readVLong();
            final int docCount = termsIn.readVInt();
            final int longsSize = termsIn.readVInt();
            if (longsSize < 0) {
                throw new CorruptIndexException(
                        "invalid longsSize for field: " + fieldInfo.name + ", longsSize=" + longsSize, termsIn);
            }
            BytesRef minTerm = readBytesRef(termsIn);
            BytesRef maxTerm = readBytesRef(termsIn);
            if (docCount < 0 || docCount > state.segmentInfo.maxDoc()) { // #docs with field must be <= #docs
                throw new CorruptIndexException(
                        "invalid docCount: " + docCount + " maxDoc: " + state.segmentInfo.maxDoc(), termsIn);
            }
            if (sumDocFreq < docCount) { // #postings must be >= #docs with field
                throw new CorruptIndexException("invalid sumDocFreq: " + sumDocFreq + " docCount: " + docCount,
                        termsIn);
            }
            if (sumTotalTermFreq != -1 && sumTotalTermFreq < sumDocFreq) { // #positions must be >= #postings
                throw new CorruptIndexException(
                        "invalid sumTotalTermFreq: " + sumTotalTermFreq + " sumDocFreq: " + sumDocFreq,
                        termsIn);
            }
            final long indexStartFP = indexIn.readVLong();
            RocanaFieldReader previous = fields.put(fieldInfo.name,
                    new RocanaFieldReader(this, fieldInfo, numTerms, rootCode, sumTotalTermFreq, sumDocFreq,
                            docCount, indexStartFP, longsSize, indexIn, minTerm, maxTerm));
            if (previous != null) {
                throw new CorruptIndexException("duplicate field: " + fieldInfo.name, termsIn);
            }
        }

        indexIn.close();
        success = true;
    } finally {
        if (!success) {
            // this.close() will close in:
            IOUtils.closeWhileHandlingException(indexIn, this);
        }
    }
}

From source file:com.vmware.xenon.services.common.Lucene60FieldInfosFormatWithCache.java

License:Open Source License

@Override
public FieldInfos read(Directory directory, SegmentInfo segmentInfo, String segmentSuffix, IOContext context)
        throws IOException {
    //////////////////////
    boolean checkInfosCache = true;
    //////////////////////
    final String fileName = IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, EXTENSION);
    try (ChecksumIndexInput input = directory.openChecksumInput(fileName, context)) {
        Throwable priorE = null;/*from ww w  .  j a v  a 2 s. co m*/
        FieldInfo[] infos = null;
        try {
            CodecUtil.checkIndexHeader(input, Lucene60FieldInfosFormatWithCache.CODEC_NAME,
                    Lucene60FieldInfosFormatWithCache.FORMAT_START,
                    Lucene60FieldInfosFormatWithCache.FORMAT_CURRENT, segmentInfo.getId(), segmentSuffix);

            final int size = input.readVInt(); //read in the size
            infos = new FieldInfo[size];

            // previous field's attribute map, we share when possible:
            Map<String, String> lastAttributes = Collections.emptyMap();

            for (int i = 0; i < size; i++) {
                String name = input.readString();
                final int fieldNumber = input.readVInt();
                if (fieldNumber < 0) {
                    throw new CorruptIndexException(
                            "invalid field number for field: " + name + ", fieldNumber=" + fieldNumber, input);
                }
                byte bits = input.readByte();
                boolean storeTermVector = (bits & STORE_TERMVECTOR) != 0;
                boolean omitNorms = (bits & OMIT_NORMS) != 0;
                boolean storePayloads = (bits & STORE_PAYLOADS) != 0;

                final IndexOptions indexOptions = getIndexOptions(input, input.readByte());

                // DV Types are packed in one byte
                final DocValuesType docValuesType = getDocValuesType(input, input.readByte());
                final long dvGen = input.readLong();
                Map<String, String> attributes = input.readMapOfStrings();
                // just use the last field's map if its the same
                if (attributes.equals(lastAttributes)) {
                    attributes = lastAttributes;
                }
                lastAttributes = attributes;
                int pointDimensionCount = input.readVInt();
                int pointNumBytes;
                if (pointDimensionCount != 0) {
                    pointNumBytes = input.readVInt();
                } else {
                    pointNumBytes = 0;
                }

                try {
                    //////////////////////
                    if (dvGen >= 0) {
                        // skip fields with docValues, they don't cache well
                        checkInfosCache = false;
                        infos[i] = new FieldInfo(name, fieldNumber, storeTermVector, omitNorms, storePayloads,
                                indexOptions, docValuesType, dvGen, attributes, pointDimensionCount,
                                pointNumBytes);
                    } else {
                        infos[i] = this.cache.dedupFieldInfo(name, fieldNumber, storeTermVector, omitNorms,
                                storePayloads, indexOptions, docValuesType, dvGen, attributes,
                                pointDimensionCount, pointNumBytes);
                    }
                    //////////////////////
                } catch (IllegalStateException e) {
                    throw new CorruptIndexException(
                            "invalid fieldinfo for field: " + name + ", fieldNumber=" + fieldNumber, input, e);
                }
            }
        } catch (Throwable exception) {
            priorE = exception;
        } finally {
            CodecUtil.checkFooter(input, priorE);
        }

        //////////////////////
        if (checkInfosCache) {
            return this.cache.dedupFieldInfos(infos);
        } else {
            FieldInfos result = new FieldInfos(infos);
            this.cache.trimFieldInfos(result);
            return result;
        }
        //////////////////////
    }
}

From source file:perf.DiskUsage.java

License:Apache License

static Set<FieldStats> analyzeFields(SegmentReader reader) throws Exception {
    Map<String, FieldStats> stats = new HashMap<>();
    Map<String, String> dvSuffixes = new HashMap<>();
    Map<String, String> postingsSuffixes = new HashMap<>();
    for (FieldInfo field : reader.getFieldInfos()) {
        FieldStats fieldStats = new FieldStats(field.name);
        stats.put(field.name, fieldStats);
        Map<String, String> attributes = field.attributes();
        if (attributes != null) {
            String postingsSuffix = attributes.get(PerFieldPostingsFormat.PER_FIELD_SUFFIX_KEY);
            if (postingsSuffix != null) {
                postingsSuffixes.put(postingsSuffix, field.name);
            }// w  ww  . j a v a  2 s. c  om
            String dvSuffix = attributes.get(PerFieldDocValuesFormat.PER_FIELD_SUFFIX_KEY);
            if (dvSuffix != null) {
                dvSuffixes.put(dvSuffix, field.name);
            }
        }

        Bits docsWithField = reader.getDocsWithField(field.name);
        if (docsWithField != null) {
            int count = 0;
            for (int docID = 0; docID < reader.maxDoc(); docID++) {
                if (docsWithField.get(docID)) {
                    count++;
                }
            }
            fieldStats.docCountWithField = count;
        }
    }

    Directory directory = reader.directory();
    for (String file : directory.listAll()) {
        String suffix = parseSuffix(file);
        long bytes = directory.fileLength(file);
        if (suffix != null) {
            switch (IndexFileNames.getExtension(file)) {
            case "dvd":
            case "dvm":
                stats.get(dvSuffixes.get(suffix)).dvBytes += bytes;
                break;
            case "tim":
            case "tip":
                stats.get(postingsSuffixes.get(suffix)).termsBytes += bytes;
                break;
            case "doc":
                stats.get(postingsSuffixes.get(suffix)).postingsBytes += bytes;
                break;
            case "pos":
            case "pay":
                stats.get(postingsSuffixes.get(suffix)).proxBytes += bytes;
                break;
            default:
                throw new AssertionError("unexpected suffixed file: " + file);
            }
        } else {
            // not a per-field file, but we can hackishly do this for the points case.
            if ("dii".equals(IndexFileNames.getExtension(file))) {
                System.err.println(
                        "retrieving per-field point usage, if you see a scary corruption error, its probably just this tool!!!!");
                try (ChecksumIndexInput in = directory.openChecksumInput(file, IOContext.READONCE)) {
                    // fail hard if its not exactly the version we do this hack for.
                    CodecUtil.checkIndexHeader(in, "Lucene60PointsFormatMeta", 0, 0,
                            reader.getSegmentInfo().info.getId(), "");
                    int fieldCount = in.readVInt();
                    // strangely, bkd offsets are not in any guaranteed order
                    TreeMap<Long, String> offsetToField = new TreeMap<>();
                    for (int i = 0; i < fieldCount; i++) {
                        int field = in.readVInt();
                        long offset = in.readVLong();
                        offsetToField.put(offset, reader.getFieldInfos().fieldInfo(field).name);
                    }
                    // now we can traverse in order
                    long previousOffset = 0;
                    for (Map.Entry<Long, String> entry : offsetToField.entrySet()) {
                        long offset = entry.getKey();
                        String field = entry.getValue();
                        stats.get(field).pointsBytes += (offset - previousOffset);
                        previousOffset = offset;
                    }
                    CodecUtil.checkFooter(in);
                }
            }
        }
    }

    return new TreeSet<FieldStats>(stats.values());
}

From source file:perf.DiskUsage.java

License:Apache License

static Set<FieldStats> analyzeFields(SegmentReader reader) throws Exception {
    Map<String, FieldStats> stats = new HashMap<>();
    Map<String, String> dvSuffixes = new HashMap<>();
    Map<String, String> postingsSuffixes = new HashMap<>();
    for (FieldInfo field : reader.getFieldInfos()) {
        FieldStats fieldStats = new FieldStats(field.name);
        stats.put(field.name, fieldStats);
        Map<String, String> attributes = field.attributes();
        if (attributes != null) {
            String postingsSuffix = attributes.get(PerFieldPostingsFormat.PER_FIELD_SUFFIX_KEY);
            if (postingsSuffix != null) {
                postingsSuffixes.put(postingsSuffix, field.name);
            }/*from w ww .  ja  v a  2 s . c o m*/
            String dvSuffix = attributes.get(PerFieldDocValuesFormat.PER_FIELD_SUFFIX_KEY);
            if (dvSuffix != null) {
                dvSuffixes.put(dvSuffix, field.name);
            }
        }

        DocIdSetIterator docsWithField;
        switch (field.getDocValuesType()) {
        case NUMERIC:
            docsWithField = reader.getNumericDocValues(field.name);
            break;
        case BINARY:
            docsWithField = reader.getBinaryDocValues(field.name);
            break;
        case SORTED:
            docsWithField = reader.getSortedDocValues(field.name);
            break;
        case SORTED_NUMERIC:
            docsWithField = reader.getSortedNumericDocValues(field.name);
            break;
        case SORTED_SET:
            docsWithField = reader.getSortedSetDocValues(field.name);
            break;
        case NONE:
            docsWithField = null;
            break;
        default:
            docsWithField = null;
            break;
        }

        if (docsWithField != null) {
            int count = 0;
            while (docsWithField.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                count++;
            }
            fieldStats.docCountWithField = count;
        }
    }

    Directory directory = reader.directory();
    for (String file : directory.listAll()) {
        String suffix = parseSuffix(file);
        long bytes = directory.fileLength(file);
        if (suffix != null) {
            switch (IndexFileNames.getExtension(file)) {
            case "dvd":
            case "dvm":
                stats.get(dvSuffixes.get(suffix)).dvBytes += bytes;
                break;
            case "tim":
            case "tip":
                stats.get(postingsSuffixes.get(suffix)).termsBytes += bytes;
                break;
            case "doc":
                stats.get(postingsSuffixes.get(suffix)).postingsBytes += bytes;
                break;
            case "pos":
            case "pay":
                stats.get(postingsSuffixes.get(suffix)).proxBytes += bytes;
                break;
            default:
                throw new AssertionError("unexpected suffixed file: " + file);
            }
        } else {
            // not a per-field file, but we can hackishly do this for the points case.
            if ("dii".equals(IndexFileNames.getExtension(file))) {
                System.err.println(
                        "retrieving per-field point usage, if you see a scary corruption error, its probably just this tool!!!!");
                try (ChecksumIndexInput in = directory.openChecksumInput(file, IOContext.READONCE)) {
                    // fail hard if its not exactly the version we do this hack for.
                    CodecUtil.checkIndexHeader(in, "Lucene60PointsFormatMeta", 0, 0,
                            reader.getSegmentInfo().info.getId(), "");
                    int fieldCount = in.readVInt();
                    // strangely, bkd offsets are not in any guaranteed order
                    TreeMap<Long, String> offsetToField = new TreeMap<>();
                    for (int i = 0; i < fieldCount; i++) {
                        int field = in.readVInt();
                        long offset = in.readVLong();
                        offsetToField.put(offset, reader.getFieldInfos().fieldInfo(field).name);
                    }
                    // now we can traverse in order
                    long previousOffset = 0;
                    for (Map.Entry<Long, String> entry : offsetToField.entrySet()) {
                        long offset = entry.getKey();
                        String field = entry.getValue();
                        stats.get(field).pointsBytes += (offset - previousOffset);
                        previousOffset = offset;
                    }
                    CodecUtil.checkFooter(in);
                }
            }
        }
    }

    return new TreeSet<FieldStats>(stats.values());
}