org.apache.lucene.benchmark.byTask.feeds.DocMaker.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.lucene.benchmark.byTask.feeds.DocMaker.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.benchmark.byTask.feeds;

import java.io.Closeable;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
import java.text.ParsePosition;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.Random;
import java.util.TimeZone;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.DoublePoint;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FloatPoint;
import org.apache.lucene.document.IntPoint;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.LongPoint;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexOptions;

/**
 * Creates {@link Document} objects. Uses a {@link ContentSource} to generate
 * {@link DocData} objects. Supports the following parameters:
 * <ul>
 * <li><b>content.source</b> - specifies the {@link ContentSource} class to use
 * (default <b>SingleDocSource</b>).
 * <li><b>doc.stored</b> - specifies whether fields should be stored (default
 * <b>false</b>).
 * <li><b>doc.body.stored</b> - specifies whether the body field should be stored (default
 * = <b>doc.stored</b>).
 * <li><b>doc.tokenized</b> - specifies whether fields should be tokenized
 * (default <b>true</b>).
 * <li><b>doc.body.tokenized</b> - specifies whether the
 * body field should be tokenized (default = <b>doc.tokenized</b>).
 * <li><b>doc.body.offsets</b> - specifies whether to add offsets into the postings index
 *  for the body field.  It is useful for highlighting.  (default <b>false</b>)
 * <li><b>doc.tokenized.norms</b> - specifies whether norms should be stored in
 * the index or not. (default <b>false</b>).
 * <li><b>doc.body.tokenized.norms</b> - specifies whether norms should be
 * stored in the index for the body field. This can be set to true, while
 * <code>doc.tokenized.norms</code> is set to false, to allow norms storing just
 * for the body field. (default <b>true</b>).
 * <li><b>doc.term.vector</b> - specifies whether term vectors should be stored
 * for fields (default <b>false</b>).
 * <li><b>doc.term.vector.positions</b> - specifies whether term vectors should
 * be stored with positions (default <b>false</b>).
 * <li><b>doc.term.vector.offsets</b> - specifies whether term vectors should be
 * stored with offsets (default <b>false</b>).
 * <li><b>doc.store.body.bytes</b> - specifies whether to store the raw bytes of
 * the document's content in the document (default <b>false</b>).
 * <li><b>doc.reuse.fields</b> - specifies whether Field and Document objects
 * should be reused (default <b>true</b>).
 * <li><b>doc.index.props</b> - specifies whether the properties returned by
 * {@link DocData#getProps()} will be indexed. (default <b>false</b>).
 * <li><b>doc.random.id.limit</b> - if specified, docs will be assigned random
 * IDs from 0 (inclusive) to this limit (exclusive).  This is useful with UpdateDoc
 * for testing performance of IndexWriter.updateDocument.
 * (default <b>-1</b>, meaning random IDs are not used).
 * </ul>
 */
public class DocMaker implements Closeable {

    /**
     * Remainder of a {@link DocData} whose body was only partially consumed by
     * {@link #makeDocument(int)}, together with the counter that is used as the
     * name suffix of the next document cut from it.
     */
    private static class LeftOver {
        private int cnt;
        private DocData docdata;
    }

    // Generator for random document IDs; created in setConfig when
    // doc.random.id.limit is set to something other than -1.
    private Random r;
    // Value of doc.random.id.limit (-1 when absent); passed to r.nextInt() as the
    // exclusive upper bound of the generated IDs.
    private int updateDocIDLimit;

    /**
     * Document state, supports reuse of field instances
     * across documents (see <code>reuseFields</code> parameter).
     * Holds the {@link Document}, the {@link DocData} and the caches of
     * {@link Field} instances that are handed out while a document is built.
     */
    protected static class DocState {

        private final Map<String, Field> fields;
        private final Map<String, Field> numericFields;
        private final boolean reuseFields;
        final Document doc;
        DocData docData = new DocData();

        /**
         * Creates the state. With <code>reuseFields</code> enabled the caches
         * are pre-populated with the default fields and a reusable
         * {@link Document} is allocated; otherwise nothing is cached and
         * <code>doc</code> is <code>null</code>.
         *
         * @param reuseFields whether Field and Document instances are reused
         * @param ft field type for the default non-body text fields
         * @param bodyFt field type for the body field
         */
        public DocState(boolean reuseFields, FieldType ft, FieldType bodyFt) {
            this.reuseFields = reuseFields;

            if (!reuseFields) {
                // Nothing is cached in this mode.
                fields = null;
                numericFields = null;
                doc = null;
            } else {
                // Initialize the map with the default fields.
                final Map<String, Field> textCache = new HashMap<>();
                textCache.put(BODY_FIELD, new Field(BODY_FIELD, "", bodyFt));
                textCache.put(TITLE_FIELD, new Field(TITLE_FIELD, "", ft));
                textCache.put(DATE_FIELD, new Field(DATE_FIELD, "", ft));
                textCache.put(ID_FIELD, new StringField(ID_FIELD, "", Field.Store.YES));
                textCache.put(NAME_FIELD, new Field(NAME_FIELD, "", ft));
                fields = textCache;

                final Map<String, Field> pointCache = new HashMap<>();
                pointCache.put(DATE_MSEC_FIELD, new LongPoint(DATE_MSEC_FIELD, 0L));
                pointCache.put(TIME_SEC_FIELD, new IntPoint(TIME_SEC_FIELD, 0));
                numericFields = pointCache;

                doc = new Document();
            }
        }

        /**
         * Returns a field corresponding to the field name. If
         * <code>reuseFields</code> was set to true, a cached instance is
         * returned when one exists for that name (the given type is then
         * ignored); otherwise a new field with an empty string value is
         * created and, in reuse mode, cached.
         */
        Field getField(String name, FieldType ft) {
            if (reuseFields) {
                final Field cached = fields.get(name);
                if (cached != null) {
                    return cached;
                }
                final Field created = new Field(name, "", ft);
                fields.put(name, created);
                return created;
            }
            return new Field(name, "", ft);
        }

        /**
         * Returns a point field for the given name. In reuse mode a cached
         * instance is returned when present; otherwise a zero-valued point
         * field matching <code>numericType</code> is created (and cached in
         * reuse mode).
         *
         * @throws UnsupportedOperationException if a field has to be created and
         *         <code>numericType</code> is not Integer, Long, Float or Double
         */
        Field getNumericField(String name, Class<? extends Number> numericType) {
            if (!reuseFields) {
                return newPointField(name, numericType);
            }
            Field cached = numericFields.get(name);
            if (cached == null) {
                cached = newPointField(name, numericType);
                numericFields.put(name, cached);
            }
            return cached;
        }

        /** Creates a zero-valued point field of the requested numeric type. */
        private static Field newPointField(String name, Class<? extends Number> numericType) {
            if (numericType.equals(Integer.class)) {
                return new IntPoint(name, 0);
            }
            if (numericType.equals(Long.class)) {
                return new LongPoint(name, 0L);
            }
            if (numericType.equals(Float.class)) {
                return new FloatPoint(name, 0.0F);
            }
            if (numericType.equals(Double.class)) {
                return new DoublePoint(name, 0.0);
            }
            throw new UnsupportedOperationException("Unsupported numeric type: " + numericType);
        }
    }

    // Whether the UTF-8 bytes of the body are also added to each document under
    // BYTES_FIELD; read from doc.store.body.bytes in setConfig.
    private boolean storeBytes = false;

    /**
     * Date helpers used while building a document: a lenient parser for
     * <code>dd-MMM-yyyy HH:mm:ss</code> date strings, a GMT calendar and a
     * reusable parse position.
     */
    private static class DateUtil {
        public SimpleDateFormat parser;
        public Calendar cal;
        public ParsePosition pos;

        public DateUtil() {
            parser = new SimpleDateFormat("dd-MMM-yyyy HH:mm:ss", Locale.ENGLISH);
            parser.setLenient(true);
            cal = Calendar.getInstance(TimeZone.getTimeZone("GMT"), Locale.ROOT);
            pos = new ParsePosition(0);
        }
    }

    // leftovers are thread local, because it is unsafe to share residues between threads
    private ThreadLocal<LeftOver> leftovr = new ThreadLocal<>();
    // Per-thread DocState, created lazily by getDocState(); setConfig replaces the
    // whole ThreadLocal to drop the cached state of every thread.
    private ThreadLocal<DocState> docState = new ThreadLocal<>();
    // Per-thread date helpers, created lazily in createDocument.
    private ThreadLocal<DateUtil> dateParsers = new ThreadLocal<>();

    /** Name of the field holding the (possibly partial) body text. */
    public static final String BODY_FIELD = "body";
    /** Name of the field holding the document title. */
    public static final String TITLE_FIELD = "doctitle";
    /** Name of the field holding the date as the original string. */
    public static final String DATE_FIELD = "docdate";
    /** Name of the long point field holding the date in milliseconds. */
    public static final String DATE_MSEC_FIELD = "docdatenum";
    /** Name of the int point field holding the second-of-day of the date (GMT calendar). */
    public static final String TIME_SEC_FIELD = "doctimesecnum";
    /** Name of the field holding the document ID. */
    public static final String ID_FIELD = "docid";
    /** Name of the field holding the UTF-8 bytes of the body (see doc.store.body.bytes). */
    public static final String BYTES_FIELD = "bytes";
    /** Name of the field holding the document name. */
    public static final String NAME_FIELD = "docname";

    /** Configuration most recently passed to setConfig. */
    protected Config config;

    /** Field type used for the non-body text fields; rebuilt and frozen by setConfig. */
    protected FieldType valType;
    /** Field type used for the body field; rebuilt and frozen by setConfig. */
    protected FieldType bodyValType;

    /** Content source most recently passed to setConfig. */
    protected ContentSource source;
    /** Value of doc.reuse.fields: whether Document and Field instances are reused. */
    protected boolean reuseFields;
    /** Value of doc.index.props: whether DocData properties are added as fields. */
    protected boolean indexProperties;

    // Supplies sequential IDs for documents whose DocData reports an ID of -1;
    // reset to 0 by resetInputs().
    private final AtomicInteger numDocsCreated = new AtomicInteger();

    /**
     * Creates an unconfigured doc maker. The configuration and the content
     * source are assigned later through {@link #setConfig(Config, ContentSource)}.
     */
    public DocMaker() {
    }

    /**
     * Builds a {@link Document} from the given {@link DocData}.
     * <p>
     * When <code>size</code> is positive and smaller than the body length only
     * a prefix of the body is used (cut at a whitespace found between
     * <code>size - 1</code> and <code>size + 19</code> when there is one) and
     * the remainder is written back into <code>docData</code>. Otherwise the
     * whole body is used and the body of <code>docData</code> is set to the
     * empty string. The properties of <code>docData</code> are reset after they
     * were indexed so they are not added more than once.
     *
     * @param docData source of the field values; its body and props are modified
     * @param size requested body length in chars, or a value &lt;= 0 to use all of it
     * @param cnt suffix appended to the document name as <code>"_" + cnt</code>;
     *        a negative value leaves the name unchanged
     * @return the populated document; the reused per-thread instance when
     *         <code>reuseFields</code> is on, a new one otherwise
     */
    private Document createDocument(DocData docData, int size, int cnt) throws UnsupportedEncodingException {

        final DocState ds = getDocState();
        final Document doc = reuseFields ? ds.doc : new Document();
        doc.clear();

        // Set ID_FIELD: always stored, regardless of doc.stored
        FieldType ft = new FieldType(valType);
        ft.setStored(true);

        Field idField = ds.getField(ID_FIELD, ft);
        int id;
        if (r != null) {
            // random IDs requested through doc.random.id.limit
            id = r.nextInt(updateDocIDLimit);
        } else {
            id = docData.getID();
            if (id == -1) {
                // the doc data carries no ID of its own: number the docs sequentially
                id = numDocsCreated.getAndIncrement();
            }
        }
        idField.setStringValue(Integer.toString(id));
        doc.add(idField);

        // Set NAME_FIELD
        String name = docData.getName();
        if (name == null) {
            name = "";
        }
        name = cnt < 0 ? name : name + "_" + cnt;
        Field nameField = ds.getField(NAME_FIELD, valType);
        nameField.setStringValue(name);
        doc.add(nameField);

        // Set DATE_FIELD
        DateUtil util = dateParsers.get();
        if (util == null) {
            util = new DateUtil();
            dateParsers.set(util);
        }
        Date date = null;
        String dateString = docData.getDate();
        if (dateString != null) {
            // the ParsePosition is reused, so rewind it before every parse
            util.pos.setIndex(0);
            date = util.parser.parse(dateString, util.pos);
        } else {
            dateString = "";
        }
        Field dateStringField = ds.getField(DATE_FIELD, valType);
        dateStringField.setStringValue(dateString);
        doc.add(dateStringField);

        if (date == null) {
            // missing or unparsable date: just set to right now
            date = new Date();
        }

        Field dateField = ds.getNumericField(DATE_MSEC_FIELD, Long.class);
        dateField.setLongValue(date.getTime());
        doc.add(dateField);

        // second-of-day according to the GMT calendar of DateUtil
        util.cal.setTime(date);
        final int sec = util.cal.get(Calendar.HOUR_OF_DAY) * 3600 + util.cal.get(Calendar.MINUTE) * 60
                + util.cal.get(Calendar.SECOND);

        Field timeSecField = ds.getNumericField(TIME_SEC_FIELD, Integer.class);
        timeSecField.setIntValue(sec);
        doc.add(timeSecField);

        // Set TITLE_FIELD
        String title = docData.getTitle();
        Field titleField = ds.getField(TITLE_FIELD, valType);
        titleField.setStringValue(title == null ? "" : title);
        doc.add(titleField);

        // Set BODY_FIELD (and BYTES_FIELD), only when there is body text
        String body = docData.getBody();
        if (body != null && body.length() > 0) {
            String bdy;
            if (size <= 0 || size >= body.length()) {
                bdy = body; // use all
                docData.setBody(""); // nothing left
            } else {
                // attempt not to break words - if whitespace found within next 20 chars...
                for (int n = size - 1; n < size + 20 && n < body.length(); n++) {
                    // n > 0 guarantees a non-empty chunk: cutting at index 0 would consume
                    // nothing and the same leftover would be handed back forever
                    if (n > 0 && Character.isWhitespace(body.charAt(n))) {
                        size = n;
                        break;
                    }
                }
                bdy = body.substring(0, size); // use part
                docData.setBody(body.substring(size)); // some left
            }
            Field bodyField = ds.getField(BODY_FIELD, bodyValType);
            bodyField.setStringValue(bdy);
            doc.add(bodyField);

            if (storeBytes) {
                // NOTE(review): getField() creates the field with a String value; verify that
                // Field#setBytesValue accepts such a field in the Lucene version in use.
                Field bytesField = ds.getField(BYTES_FIELD, StringField.TYPE_STORED);
                bytesField.setBytesValue(bdy.getBytes(StandardCharsets.UTF_8));
                doc.add(bytesField);
            }
        }

        if (indexProperties) {
            Properties props = docData.getProps();
            if (props != null) {
                for (final Map.Entry<Object, Object> entry : props.entrySet()) {
                    Field f = ds.getField((String) entry.getKey(), valType);
                    f.setStringValue((String) entry.getValue());
                    doc.add(f);
                }
                // reset so the same props are not added again with the next chunk
                docData.setProps(null);
            }
        }

        return doc;
    }

    /** Discards the leftover document data of the calling thread, if any. */
    private void resetLeftovers() {
        leftovr.remove();
    }

    /**
     * Returns the {@link DocState} of the calling thread, creating it from the
     * current <code>reuseFields</code> setting and field types on first use.
     */
    protected DocState getDocState() {
        final DocState existing = docState.get();
        if (existing != null) {
            return existing;
        }
        final DocState created = new DocState(reuseFields, valType, bodyValType);
        docState.set(created);
        return created;
    }

    /**
     * Closes the {@link DocMaker}. The base implementation closes the
     * {@link ContentSource}, and it can be overridden to do more work (but make
     * sure to call super.close()). Closing a doc maker that was never
     * configured, and therefore has no content source, does nothing.
     *
     * @throws IOException if closing the content source fails
     */
    @Override
    public void close() throws IOException {
        // source is only assigned by setConfig(); without it there is nothing to close
        if (source != null) {
            source.close();
        }
    }

    /**
     * Creates a {@link Document} object ready for indexing. The next
     * {@link DocData} is fetched from the {@link ContentSource} and its whole
     * body goes into the document; any leftover kept by the calling thread from
     * {@link #makeDocument(int)} is discarded first. If <code>reuseFields</code>
     * was set to true, {@link Document} and {@link Field} instances are reused.
     */
    public Document makeDocument() throws Exception {
        resetLeftovers();
        final DocData reusable = getDocState().docData;
        final DocData next = source.getNextDocData(reusable);
        // size 0 = use the whole body, cnt -1 = no suffix on the doc name
        return createDocument(next, 0, -1);
    }

    /**
     * Same as {@link #makeDocument()}, only this method creates a document of the
     * given size input by <code>size</code>.
     * <p>
     * Body text left over from the previous call of the same thread is used
     * first. While the available body is shorter than <code>size</code>, the
     * next {@link DocData} is fetched from the source and its body is appended
     * to what is already there; a missing (<code>null</code>) body counts as
     * empty text. Whatever is not consumed by the created document is kept as
     * the thread's leftover for the next call.
     *
     * @param size requested body length in chars
     */
    public Document makeDocument(int size) throws Exception {
        LeftOver lvr = leftovr.get();
        if (lvr == null || lvr.docdata == null || lvr.docdata.getBody() == null
                || lvr.docdata.getBody().length() == 0) {
            resetLeftovers();
            // Drop the local reference as well: otherwise the exhausted leftover would
            // still be read below and a new leftover would never reach the thread local.
            lvr = null;
        }
        DocData docData = getDocState().docData;
        DocData dd = (lvr == null ? source.getNextDocData(docData) : lvr.docdata);
        int cnt = (lvr == null ? 0 : lvr.cnt);
        while (dd.getBody() == null || dd.getBody().length() < size) {
            DocData dd2 = dd;
            dd = source.getNextDocData(new DocData());
            cnt = 0;
            // Null-safe concatenation: a plain '+' would splice the text "null" into the body.
            String head = dd2.getBody();
            String tail = dd.getBody();
            dd.setBody((head == null ? "" : head) + (tail == null ? "" : tail));
        }
        Document doc = createDocument(dd, size, cnt);
        if (dd.getBody() == null || dd.getBody().length() == 0) {
            resetLeftovers();
        } else {
            if (lvr == null) {
                lvr = new LeftOver();
                leftovr.set(lvr);
            }
            lvr.docdata = dd;
            lvr.cnt = ++cnt;
        }
        return doc;
    }

    /**
     * Reset inputs so that the test run would behave, input wise, as if it just
     * started: prints the source statistics, re-applies the configuration,
     * rewinds the content source, restarts the document ID counter and drops
     * the calling thread's leftover.
     */
    public synchronized void resetInputs() throws IOException {
        this.source.printStatistics("docs");
        // properties may differ from round to round, so read them again
        this.setConfig(this.config, this.source);
        this.source.resetInputs();
        this.numDocsCreated.set(0);
        this.resetLeftovers();
    }

    /**
     * Set the configuration parameters of this doc maker.
     * <p>
     * Reads the <code>doc.*</code> properties, rebuilds the (frozen) field types
     * for the body and the other text fields, discards the cached
     * {@link DocState} of all threads and enables or disables random document
     * IDs according to <code>doc.random.id.limit</code>.
     *
     * @param config configuration to read the properties from
     * @param source content source the documents are created from
     */
    public void setConfig(Config config, ContentSource source) {
        this.config = config;
        this.source = source;

        boolean stored = config.get("doc.stored", false);
        boolean bodyStored = config.get("doc.body.stored", stored);
        boolean tokenized = config.get("doc.tokenized", true);
        boolean bodyTokenized = config.get("doc.body.tokenized", tokenized);
        boolean norms = config.get("doc.tokenized.norms", false);
        boolean bodyNorms = config.get("doc.body.tokenized.norms", true);
        boolean bodyOffsets = config.get("doc.body.offsets", false);
        boolean termVec = config.get("doc.term.vector", false);
        boolean termVecPositions = config.get("doc.term.vector.positions", false);
        boolean termVecOffsets = config.get("doc.term.vector.offsets", false);

        // Type of every text field except the body.
        valType = new FieldType(TextField.TYPE_NOT_STORED);
        valType.setStored(stored);
        valType.setTokenized(tokenized);
        valType.setOmitNorms(!norms);
        valType.setStoreTermVectors(termVec);
        valType.setStoreTermVectorPositions(termVecPositions);
        valType.setStoreTermVectorOffsets(termVecOffsets);
        valType.freeze();

        // Type of the body field, which has its own stored/tokenized/norms settings.
        bodyValType = new FieldType(TextField.TYPE_NOT_STORED);
        bodyValType.setStored(bodyStored);
        bodyValType.setTokenized(bodyTokenized);
        bodyValType.setOmitNorms(!bodyNorms);
        if (bodyTokenized && bodyOffsets) {
            bodyValType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
        }
        bodyValType.setStoreTermVectors(termVec);
        bodyValType.setStoreTermVectorPositions(termVecPositions);
        bodyValType.setStoreTermVectorOffsets(termVecOffsets);
        bodyValType.freeze();

        storeBytes = config.get("doc.store.body.bytes", false);

        reuseFields = config.get("doc.reuse.fields", true);

        // In a multi-rounds run, it is important to reset DocState since settings
        // of fields may change between rounds, and this is the only way to reset
        // the cache of all threads.
        docState = new ThreadLocal<>();

        indexProperties = config.get("doc.index.props", false);

        updateDocIDLimit = config.get("doc.random.id.limit", -1);
        // Always reassign the generator: if an earlier round enabled random IDs and
        // this one does not, a stale generator would be asked for nextInt(-1) and
        // fail with an IllegalArgumentException while creating documents.
        r = (updateDocIDLimit != -1) ? new Random(179) : null;
    }

}