org.apache.maven.index.updater.IndexDataReader.java Source code

Introduction

Here is the source code for org.apache.maven.index.updater.IndexDataReader.java
Source

package org.apache.maven.index.updater;

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

import java.io.BufferedInputStream;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.io.UTFDataFormatException;
import java.util.Date;
import java.util.zip.GZIPInputStream;

import com.google.common.base.Strings;
import java.util.LinkedHashSet;
import java.util.Set;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexWriter;
import org.apache.maven.index.ArtifactInfo;
import org.apache.maven.index.context.IndexUtils;
import org.apache.maven.index.context.IndexingContext;

/**
 * An index data reader used to parse the transfer index format.
 *
 * @author Eugene Kuleshov
 */
public class IndexDataReader {
    /** Buffered (and, when needed, un-gzipped) view of the input that all read methods consume. */
    private final DataInputStream dis;

    /**
     * Creates a reader over the given transfer-format stream.
     * <p>
     * The first two bytes are sniffed for the GZIP magic number so that compressed input and input that
     * has already been decompressed upstream are both handled transparently. The sniffed bytes are always
     * pushed back, so the header is read from the very beginning of the data.
     *
     * @param is the raw index data, either gzipped or plain; need not support mark/reset
     * @throws IOException if the stream cannot be read, or the GZIP header is corrupt
     */
    public IndexDataReader(final InputStream is) throws IOException {
        if (is == null) {
            throw new NullPointerException("is");
        }

        final int bufferSize = 1024 * 8;

        // MINDEXER-13
        // LightweightHttpWagon may have performed automatic decompression
        // Handle it transparently
        //
        // Sniff through a buffer we own: BufferedInputStream always supports mark/reset, so the peeked
        // bytes can be pushed back regardless of what kind of stream the caller handed in.
        final BufferedInputStream buffered = new BufferedInputStream(is, bufferSize);
        buffered.mark(2);
        final boolean gzipped = buffered.read() == 0x1f && buffered.read() == 0x8b; // GZIPInputStream.GZIP_MAGIC

        // Rewind the stream that was actually marked, in BOTH cases. Resetting a fresh, never-marked
        // wrapper instead always fails and loses the first bytes of plain (non-gzipped) input.
        buffered.reset();

        final InputStream data;
        if (gzipped) {
            data = new BufferedInputStream(new GZIPInputStream(buffered, bufferSize), bufferSize);
        } else {
            data = buffered;
        }

        this.dis = new DataInputStream(data);
    }

    /**
     * Reads the header and every document from the stream into the given Lucene writer, then commits it.
     * <p>
     * When the header timestamp is not {@code -1} it is also stored on the writer's directory via
     * {@link IndexUtils#updateTimestamp}. Documents for which an {@link ArtifactInfo} can be constructed
     * are passed through {@link IndexUtils#updateDocument} before being added and contribute their root
     * group and group id to the result; all other documents are added unchanged.
     *
     * @param w the writer receiving the documents
     * @param context the indexing context used to interpret the documents
     * @return document count, timestamp ({@code null} if the header carried {@code -1}) and collected groups
     * @throws IOException if the header is invalid or the stream cannot be read or written
     */
    public IndexDataReadResult readIndex(IndexWriter w, IndexingContext context) throws IOException {
        final long timestamp = readHeader();
        final Date date = timestamp == -1 ? null : new Date(timestamp);
        if (date != null) {
            IndexUtils.updateTimestamp(w.getDirectory(), date);
        }

        final Set<String> rootGroups = new LinkedHashSet<>();
        final Set<String> allGroups = new LinkedHashSet<>();
        int documentCount = 0;

        for (Document doc = readDocument(); doc != null; doc = readDocument()) {
            final ArtifactInfo ai = IndexUtils.constructArtifactInfo(doc, context);
            if (ai == null) {
                // Not an artifact record: store it as read.
                w.addDocument(doc);
            } else {
                w.addDocument(IndexUtils.updateDocument(doc, context, false, ai));
                rootGroups.add(ai.getRootGroup());
                allGroups.add(ai.getGroupId());
            }
            documentCount++;
        }

        w.commit();

        final IndexDataReadResult result = new IndexDataReadResult();
        result.setDocumentCount(documentCount);
        result.setTimestamp(date);
        result.setRootGroups(rootGroups);
        result.setAllGroups(allGroups);
        return result;
    }

    /**
     * Reads the stream header: one format-version byte followed by an eight-byte timestamp.
     *
     * @return the timestamp stored in the header, exactly as read
     * @throws IOException if the version byte does not match the low-order byte of
     *         {@code IndexDataWriter.VERSION}, or the stream cannot be read
     */
    public long readHeader() throws IOException {
        // Narrowing keeps just the low-order byte of the version constant, which is what gets compared.
        final byte expectedVersion = (byte) IndexDataWriter.VERSION;
        final byte actualVersion = dis.readByte();

        if (actualVersion != expectedVersion) {
            // data format version mismatch
            throw new IOException("Provided input contains unexpected data (0x01 expected as 1st byte)!");
        }

        return dis.readLong();
    }

    /**
     * Reads the next document from the stream: a field count followed by that many fields.
     * <p>
     * After reading, the UINFO field is patched (MINDEXER-41): if it ends with the "not available"
     * marker and the INFO field has more than six separator-delimited parts, the seventh part (the
     * extension) is appended to UINFO.
     *
     * @return the document read, or {@code null} when the stream ends where a field count was expected
     * @throws IOException if the stream cannot be read or a field is malformed
     */
    public Document readDocument() throws IOException {
        final int fieldCount;
        try {
            fieldCount = dis.readInt();
        } catch (EOFException endOfData) {
            return null; // no more documents
        }

        final Document doc = new Document();
        for (int remaining = fieldCount; remaining > 0; remaining--) {
            doc.add(readField());
        }

        // Fix up UINFO field wrt MINDEXER-41
        final Field uinfoField = (Field) doc.getField(ArtifactInfo.UINFO);
        final String info = doc.get(ArtifactInfo.INFO);
        if (uinfoField == null || Strings.isNullOrEmpty(info)) {
            return doc;
        }

        final String[] infoParts = ArtifactInfo.FS_PATTERN.split(info);
        if (infoParts.length <= 6) {
            return doc;
        }

        final String uinfo = uinfoField.stringValue();
        if (uinfo.endsWith(ArtifactInfo.FS + ArtifactInfo.NA)) {
            uinfoField.setStringValue(uinfo + ArtifactInfo.FS + ArtifactInfo.nvl(infoParts[6]));
        }
        return doc;
    }

    /**
     * Reads a single field: one flags byte, the field name, then the field value.
     * <p>
     * The flags are tested against the {@code IndexDataWriter.F_*} bit masks to choose the Lucene
     * {@link Index} and {@link Store} modes. The name is read with {@link DataInputStream#readUTF()},
     * the value with this class's own {@code readUTF}, which uses a four-byte length prefix.
     *
     * @return the field that was read
     * @throws IOException if the stream cannot be read or the strings are malformed
     */
    private Field readField() throws IOException {
        final int flags = dis.read();

        final Index index;
        if ((flags & IndexDataWriter.F_INDEXED) > 0) {
            index = (flags & IndexDataWriter.F_TOKENIZED) > 0 ? Index.ANALYZED : Index.NOT_ANALYZED;
        } else {
            index = Index.NO;
        }

        final Store store = (flags & IndexDataWriter.F_STORED) > 0 ? Store.YES : Store.NO;

        final String name = dis.readUTF();
        final String value = readUTF(dis);

        return new Field(name, value, store, index);
    }

    /**
     * Reads a string stored as a four-byte length followed by that many encoded bytes.
     * <p>
     * The bytes are decoded with the same one-, two- and three-byte rules as
     * {@link java.io.DataInputStream#readUTF()}; the only difference visible here is the {@code int}
     * length prefix, which allows values longer than an unsigned short can describe.
     *
     * @param in the input to read from
     * @return the decoded string; may contain fewer chars than the byte length read
     * @throws UTFDataFormatException if the bytes are not validly encoded
     * @throws IOException if the length is negative or too large to allocate (junk input), or the
     *         input cannot be read
     */
    private static String readUTF(DataInput in) throws IOException {
        final int utflen = in.readInt();
        if (utflen < 0) {
            // Junk input: without this check the allocation below would fail with an unchecked
            // NegativeArraySizeException instead of the IOException callers are prepared for.
            throw new IOException(
                    "Index data content is inappropriate (is junk?), negative string length: " + utflen + "!");
        }

        byte[] bytearr;
        char[] chararr;

        try {
            bytearr = new byte[utflen];
            chararr = new char[utflen];
        } catch (OutOfMemoryError e) {
            throw new IOException(
                    "Index data content is inappropriate (is junk?), leads to OutOfMemoryError!"
                            + " See MINDEXER-28 for more information!",
                    e);
        }

        int c;
        int char2;
        int char3;
        int count = 0;
        int chararrCount = 0;

        in.readFully(bytearr, 0, utflen);

        // Fast path: copy the leading run of single-byte (ASCII) characters.
        while (count < utflen) {
            c = bytearr[count] & 0xff;
            if (c > 127) {
                break;
            }
            count++;
            chararr[chararrCount++] = (char) c;
        }

        // General path: dispatch on the high nibble of the lead byte.
        while (count < utflen) {
            c = bytearr[count] & 0xff;
            switch (c >> 4) {
            case 0:
            case 1:
            case 2:
            case 3:
            case 4:
            case 5:
            case 6:
            case 7:
                /* 0xxxxxxx */
                count++;
                chararr[chararrCount++] = (char) c;
                break;

            case 12:
            case 13:
                /* 110x xxxx 10xx xxxx */
                count += 2;
                if (count > utflen) {
                    throw new UTFDataFormatException("malformed input: partial character at end");
                }
                char2 = bytearr[count - 1];
                if ((char2 & 0xC0) != 0x80) {
                    throw new UTFDataFormatException("malformed input around byte " + count);
                }
                chararr[chararrCount++] = (char) (((c & 0x1F) << 6) | (char2 & 0x3F));
                break;

            case 14:
                /* 1110 xxxx 10xx xxxx 10xx xxxx */
                count += 3;
                if (count > utflen) {
                    throw new UTFDataFormatException("malformed input: partial character at end");
                }
                char2 = bytearr[count - 2];
                char3 = bytearr[count - 1];
                if (((char2 & 0xC0) != 0x80) || ((char3 & 0xC0) != 0x80)) {
                    throw new UTFDataFormatException("malformed input around byte " + (count - 1));
                }
                chararr[chararrCount++] = (char) (((c & 0x0F) << 12) | ((char2 & 0x3F) << 6) | (char3 & 0x3F));
                break;

            default:
                /* 10xx xxxx, 1111 xxxx */
                throw new UTFDataFormatException("malformed input around byte " + count);
            }
        }

        // The number of chars produced may be less than utflen
        return new String(chararr, 0, chararrCount);
    }

    /**
     * Holder for what a {@code readIndex} call produced: the number of documents read, the header
     * timestamp, and the root groups and group ids that were collected. All values are whatever was
     * last passed to the matching setter; nothing is defaulted or copied.
     */
    public static class IndexDataReadResult {
        private int documentCount;

        private Date timestamp;

        private Set<String> rootGroups;

        private Set<String> allGroups;

        /** @return the number of documents read */
        public int getDocumentCount() {
            return documentCount;
        }

        /** @param documentCount the number of documents read */
        public void setDocumentCount(int documentCount) {
            this.documentCount = documentCount;
        }

        /** @return the timestamp that was set, possibly {@code null} */
        public Date getTimestamp() {
            return timestamp;
        }

        /** @param timestamp the timestamp to record, possibly {@code null} */
        public void setTimestamp(Date timestamp) {
            this.timestamp = timestamp;
        }

        /** @return the root groups that were set, or {@code null} if none were set */
        public Set<String> getRootGroups() {
            return rootGroups;
        }

        /** @param rootGroups the root groups to record; stored by reference */
        public void setRootGroups(Set<String> rootGroups) {
            this.rootGroups = rootGroups;
        }

        /** @return the group ids that were set, or {@code null} if none were set */
        public Set<String> getAllGroups() {
            return allGroups;
        }

        /** @param allGroups the group ids to record; stored by reference */
        public void setAllGroups(Set<String> allGroups) {
            this.allGroups = allGroups;
        }

    }

    /**
     * Reads index content by using a visitor. <br>
     * The visitor is called for each read document after it has been populated with Lucene fields and
     * passed through {@link IndexUtils#updateDocument}.
     * <p>
     * The header is read with {@link #readHeader()}, so input whose first byte is not the expected
     * format version is rejected before any document is parsed. The returned result carries only the
     * document count and the timestamp; its root groups and all groups are left unset ({@code null}).
     *
     * @param visitor an index data visitor
     * @param context indexing context
     * @return statistics about read data
     * @throws IOException in case of an IO exception during index file access, or if the header does
     *         not carry the expected format version
     */
    public IndexDataReadResult readIndex(final IndexDataReadVisitor visitor, final IndexingContext context)
            throws IOException {
        // Validate the format version rather than skipping it, so junk input fails fast here too.
        final long timestamp = readHeader();

        // -1 in the header means no timestamp was recorded.
        final Date date = timestamp == -1 ? null : new Date(timestamp);

        int n = 0;

        Document doc;
        while ((doc = readDocument()) != null) {
            visitor.visitDocument(IndexUtils.updateDocument(doc, context, false));

            n++;
        }

        final IndexDataReadResult result = new IndexDataReadResult();
        result.setDocumentCount(n);
        result.setTimestamp(date);
        return result;
    }

    /**
     * Visitor of indexed Lucene documents.
     * <p>
     * An implementation is handed to {@link #readIndex(IndexDataReadVisitor, IndexingContext)}, which
     * calls it once for every document read from the stream, in stream order.
     */
    public interface IndexDataReadVisitor {

        /**
         * Called on each read document. The document is already populated with fields.
         *
         * @param document read document
         */
        void visitDocument(Document document);

    }

}