org.fastcatsearch.ir.document.DocumentReader.java Source code

Java tutorial

Introduction

Here is the source code for org.fastcatsearch.ir.document.DocumentReader.java

Source

/*
 * Copyright 2013 Websquared, Inc.
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.fastcatsearch.ir.document;

import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.zip.Inflater;
import java.util.zip.InflaterInputStream;

import org.apache.commons.io.input.BoundedInputStream;
import org.apache.lucene.util.BytesRef;
import org.fastcatsearch.ir.common.IndexFileNames;
import org.fastcatsearch.ir.field.Field;
import org.fastcatsearch.ir.field.FieldDataParseException;
import org.fastcatsearch.ir.io.BufferedFileInput;
import org.fastcatsearch.ir.io.ByteRefArrayOutputStream;
import org.fastcatsearch.ir.io.BytesDataInput;
import org.fastcatsearch.ir.io.DataInput;
import org.fastcatsearch.ir.io.IOUtil;
import org.fastcatsearch.ir.io.IndexInput;
import org.fastcatsearch.ir.settings.FieldSetting;
import org.fastcatsearch.ir.settings.SchemaSetting;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Reads stored documents from the document files of one index directory.
 *
 * <p>Layout, as read by this class:
 * <ul>
 * <li>The position file ({@link IndexFileNames#docPosition}) holds one {@code long} per
 * document at offset {@code docNo * IOUtil.SIZE_OF_LONG}; the value is an offset into the
 * stored file.</li>
 * <li>The stored file ({@link IndexFileNames#docStored}) starts with an {@code int} document
 * count. Each document record is an {@code int} length followed by that many bytes of
 * deflate-compressed data.</li>
 * <li>The inflated data holds, for every schema field in order, a boolean "has value" flag
 * followed by the raw field data when the flag is set.</li>
 * </ul>
 *
 * <p>Document numbers passed to the read methods are used as-is to index the position file;
 * {@code baseDocNo} is only added when assigning the id of the returned {@link Document}.
 *
 * <p>Each instance keeps unsynchronized mutable state (file positions, work buffers and a
 * one-document cache). Use {@link #clone()} to obtain a reader with its own state.
 */
public class DocumentReader implements Cloneable {
    private static final Logger logger = LoggerFactory.getLogger(DocumentReader.class);

    /** Initial capacity of the buffer that receives the inflated document bytes. */
    private static final int INFLATE_BUFFER_INIT_SIZE = 20 * 1024;
    /** Size of the scratch buffer used to move inflated bytes into {@link #inflaterOutput}. */
    private static final int WORKING_BUFFER_SIZE = 1024;
    /** Size of the compressed-input buffer handed to {@link InflaterInputStream}. */
    private static final int INFLATER_INPUT_BUFFER_SIZE = 512;
    /** Value of {@link #lastDocNo} when no document is cached. */
    private static final int NO_CACHED_DOC = -1;

    private List<FieldSetting> fields;
    private IndexInput docInput;
    private IndexInput positionInput;
    private ByteRefArrayOutputStream inflaterOutput;
    private byte[] workingBuffer;

    private int baseDocNo;
    private int documentCount;
    // One-document cache: the number of the last inflated document and a view over its bytes.
    private int lastDocNo = NO_CACHED_DOC;
    private DataInput lastBai;
    private long positionLimit;

    /**
     * Creates an empty reader with no open files. Used by {@link #clone()}, which fills in
     * the fields itself.
     */
    public DocumentReader() {
    }

    /**
     * Opens the document files in {@code dir} with a base document number of 0.
     *
     * @param schemaSetting schema providing the ordered field settings
     * @param dir directory containing the stored and position files
     * @throws IOException if the files cannot be opened or the header cannot be read
     */
    public DocumentReader(SchemaSetting schemaSetting, File dir) throws IOException {
        this(schemaSetting, dir, 0);
    }

    /**
     * Opens the document files in {@code dir}.
     *
     * @param schemaSetting schema providing the ordered field settings
     * @param dir directory containing the stored and position files
     * @param baseDocNo value added to a document number to form the returned document's id
     * @throws IOException if the files cannot be opened or the header cannot be read
     */
    public DocumentReader(SchemaSetting schemaSetting, File dir, int baseDocNo) throws IOException {
        this.baseDocNo = baseDocNo;
        fields = schemaSetting.getFieldSettingList();

        boolean opened = false;
        try {
            docInput = new BufferedFileInput(dir, IndexFileNames.docStored);
            positionInput = new BufferedFileInput(dir, IndexFileNames.docPosition);
            positionLimit = positionInput.length();
            documentCount = docInput.readInt();
            opened = true;
        } finally {
            if (!opened) {
                // Do not leak an already opened file when a later step fails.
                try {
                    closeInputs();
                } catch (IOException ignored) {
                    // The failure that brought us here is the one worth reporting.
                }
            }
        }
        logger.info("DocumentCount = {}", documentCount);

        inflaterOutput = new ByteRefArrayOutputStream(INFLATE_BUFFER_INIT_SIZE);
        workingBuffer = new byte[WORKING_BUFFER_SIZE];
    }

    /**
     * @return the document count read from the header of the stored file
     */
    public int getDocumentCount() {
        return documentCount;
    }

    /**
     * @return the base document number given at construction
     */
    public int getBaseNumber() {
        return baseDocNo;
    }

    /**
     * Reads a document with all fields loaded.
     *
     * @param docNo document number used to index the position file
     * @return the document, or {@code null} if {@code docNo} is out of range
     * @throws IOException if reading or inflating the document fails
     */
    public Document readDocument(int docNo) throws IOException {
        return readDocument(docNo, null);
    }

    /**
     * Reads a document with all fields loaded and parsed via {@code Field.parseIndexable}.
     *
     * @param docNo document number used to index the position file
     * @return the document, or {@code null} if {@code docNo} is out of range
     * @throws IOException if reading, inflating or parsing the document fails
     */
    public Document readIndexableDocument(int docNo) throws IOException {
        return readDocument(docNo, null, true);
    }

    /**
     * Reads a document, loading only the selected fields.
     *
     * @param docNo document number used to index the position file
     * @param fieldSelectOption per-field flags in schema order; {@code null} selects all fields
     * @return the document, or {@code null} if {@code docNo} is out of range
     * @throws IOException if reading or inflating the document fails
     */
    public Document readDocument(int docNo, boolean[] fieldSelectOption) throws IOException {
        return readDocument(docNo, fieldSelectOption, false);
    }

    /**
     * Reads a document, loading only the selected fields and optionally parsing them.
     *
     * <p>A field that has a stored value but is not selected is left {@code null} in the
     * returned document. A field without a stored value is always set to an empty field.
     * The returned document's id is {@code docNo + baseDocNo}.
     *
     * @param docNo document number used to index the position file
     * @param fieldSelectOption per-field flags in schema order; {@code null} selects all fields
     * @param indexable when {@code true}, every non-null field is parsed with
     *        {@code parseIndexable} using the field's multi-value delimiter
     * @return the document, or {@code null} if {@code docNo} is negative or lies beyond the
     *         end of the position file
     * @throws IOException if reading or inflating fails, or wrapping a
     *         {@link FieldDataParseException} raised while parsing a field
     */
    public Document readDocument(int docNo, boolean[] fieldSelectOption, boolean indexable) throws IOException {
        DataInput bai;
        if (docNo == lastDocNo && lastBai != null) {
            // Same document as the previous call: rewind the already inflated bytes.
            lastBai.reset();
            bai = lastBai;
        } else {
            bai = loadDocumentData(docNo);
            if (bai == null) {
                return null;
            }
        }

        Document document = new Document(fields.size());
        for (int i = 0; i < fields.size(); i++) {
            FieldSetting fs = fields.get(i);
            Field f = null;
            boolean hasValue = bai.readBoolean();
            if (hasValue) {
                if (fieldSelectOption == null || fieldSelectOption[i]) {
                    f = fs.createEmptyField();
                    f.readRawFrom(bai);
                } else {
                    // Not selected: skip the stored data so the next field lines up.
                    bai.skipVIntData();
                }
            } else {
                // No stored value: the document still gets an empty field for this slot.
                f = fs.createEmptyField();
            }
            if (f != null && indexable) {
                String multiValueDelimiter = fs.getMultiValueDelimiter();
                try {
                    f.parseIndexable(multiValueDelimiter);
                } catch (FieldDataParseException e) {
                    throw new IOException(e);
                }
            }
            document.set(i, f);
        }

        document.setDocId(docNo + baseDocNo);

        return document;
    }

    /**
     * Locates, inflates and caches the raw bytes of one document.
     *
     * @param docNo document number used to index the position file
     * @return a view over the inflated bytes, or {@code null} if {@code docNo} is out of range
     * @throws IOException if reading or inflating fails
     */
    private DataInput loadDocumentData(int docNo) throws IOException {
        if (docNo < 0) {
            return null;
        }
        // Widen before multiplying so a large docNo cannot overflow int arithmetic.
        long positionOffset = (long) docNo * IOUtil.SIZE_OF_LONG;
        if (positionOffset >= positionLimit) {
            return null;
        }
        positionInput.seek(positionOffset);
        long pos = positionInput.readLong();
        // Move to the document record: an int length followed by the compressed bytes.
        docInput.seek(pos);
        int len = docInput.readInt();

        // Drop the cache before the shared buffer is overwritten, so a failed read cannot
        // leave the cache pointing at partially replaced data.
        lastDocNo = NO_CACHED_DOC;
        lastBai = null;
        inflaterOutput.reset();

        // NOTE(review): the original 2014-11-26 comment here was garbled; it mentions GC and
        // OOM, presumably the reason the data is streamed through a small reusable buffer.
        Inflater inflater = new Inflater();
        try {
            BoundedInputStream boundedInputStream = new BoundedInputStream(docInput, len);
            // Closing the inflater stream must not close docInput, which stays open.
            boundedInputStream.setPropagateClose(false);
            InflaterInputStream decompressInputStream =
                    new InflaterInputStream(boundedInputStream, inflater, INFLATER_INPUT_BUFFER_SIZE);
            try {
                int count;
                while ((count = decompressInputStream.read(workingBuffer)) != -1) {
                    inflaterOutput.write(workingBuffer, 0, count);
                }
            } finally {
                decompressInputStream.close();
            }
        } finally {
            // InflaterInputStream does not end an inflater supplied by the caller, so
            // release it here instead of waiting for garbage collection.
            inflater.end();
        }

        BytesRef bytesRef = inflaterOutput.getBytesRef();
        DataInput bai = new BytesDataInput(bytesRef.bytes, 0, bytesRef.length);

        lastDocNo = docNo;
        lastBai = bai;
        return bai;
    }

    /**
     * Creates a reader over clones of this reader's inputs, with its own work buffers and an
     * empty document cache. The field settings list is shared with this reader.
     *
     * @return the new reader
     */
    @Override
    public DocumentReader clone() {
        DocumentReader reader = new DocumentReader();
        reader.fields = fields;
        reader.docInput = docInput.clone();
        reader.positionInput = positionInput.clone();
        reader.baseDocNo = baseDocNo;
        reader.documentCount = documentCount;

        reader.inflaterOutput = new ByteRefArrayOutputStream(INFLATE_BUFFER_INIT_SIZE);
        reader.workingBuffer = new byte[WORKING_BUFFER_SIZE];
        reader.positionLimit = positionLimit;
        return reader;
    }

    /**
     * Closes the stored and position inputs. Does nothing for inputs that were never opened.
     *
     * @throws IOException if closing either input fails
     */
    public void close() throws IOException {
        closeInputs();
    }

    /**
     * Closes both inputs, attempting the position input even when closing the stored input
     * throws.
     */
    private void closeInputs() throws IOException {
        try {
            if (docInput != null) {
                docInput.close();
            }
        } finally {
            if (positionInput != null) {
                positionInput.close();
            }
        }
    }
}