org.apache.lucene.util.LineFileDocs.java Source code


Introduction

Here is the source code for org.apache.lucene.util.LineFileDocs.java. LineFileDocs is a minimal port of the benchmark module's LineDocSource and DocMaker, so tests can enumerate documents from a line file created by the benchmark's WriteLineDoc task.
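
Each line of the input file stores one document as three tab-separated fields, which nextDoc() splits into the title, date, and body fields. A made-up illustrative line (with \t standing in for a real tab character) would look like:

Some article title\t2012-01-03 10:20:30.000\tThe full body text of the article goes here ...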

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.util;

import java.io.BufferedReader;
import java.io.Closeable;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.channels.Channels;
import java.nio.channels.SeekableByteChannel;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Random;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.zip.GZIPInputStream;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.IntPoint;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexableField;

/** Minimal port of benchmark's LineDocSource +
 * DocMaker, so tests can enumerate docs from a line file created
 * by benchmark's WriteLineDoc task */
public class LineFileDocs implements Closeable {

    private BufferedReader reader;
    private final static int BUFFER_SIZE = 1 << 16; // 64K
    private final AtomicInteger id = new AtomicInteger();
    private final String path;
    private final Random random;

    /** The file is rewound at EOF, so the docs are
     * repeated over and over. */
    public LineFileDocs(Random random, String path) throws IOException {
        this.path = path;
        this.random = new Random(random.nextLong());
        open(random);
    }

    public LineFileDocs(Random random) throws IOException {
        this(random, LuceneTestCase.TEST_LINE_DOCS_FILE);
    }

    @Override
    public synchronized void close() throws IOException {
        IOUtils.close(reader, threadDocs);
        reader = null;
    }

    // Returns a random position within the first third of the file,
    // or 0 if there is no Random or the file is too small to seek into.
    private long randomSeekPos(Random random, long size) {
        if (random == null || size <= 3L)
            return 0L;
        return (random.nextLong() & Long.MAX_VALUE) % (size / 3);
    }

    private synchronized void open(Random random) throws IOException {
        InputStream is = getClass().getResourceAsStream(path);
        boolean needSkip = true;
        long size = 0L, seekTo = 0L;
        if (is == null) {
            // if it's not in classpath, we load it as absolute filesystem path (e.g. Hudson's home dir)
            Path file = Paths.get(path);
            size = Files.size(file);
            if (path.endsWith(".gz")) {
                // if it is a gzip file, we need to use InputStream and slowly skipTo:
                is = Files.newInputStream(file);
            } else {
                // optimized seek using SeekableByteChannel
                seekTo = randomSeekPos(random, size);
                final SeekableByteChannel channel = Files.newByteChannel(file);
                if (LuceneTestCase.VERBOSE) {
                    System.out.println("TEST: LineFileDocs: file seek to fp=" + seekTo + " on open");
                }
                channel.position(seekTo);
                is = Channels.newInputStream(channel);
                needSkip = false;
            }
        } else {
            // if the file comes from Classpath:
            size = is.available();
        }

        if (path.endsWith(".gz")) {
            is = new GZIPInputStream(is);
            // guesstimate the uncompressed size:
            size *= 2.8;
        }

        // If we only have an InputStream, we need to seek now,
        // but this seek is a scan, so very inefficient!!!
        if (needSkip) {
            seekTo = randomSeekPos(random, size);
            if (LuceneTestCase.VERBOSE) {
                System.out.println("TEST: LineFileDocs: stream skip to fp=" + seekTo + " on open");
            }
            is.skip(seekTo);
        }

        // if we seeked somewhere, read until newline char
        if (seekTo > 0L) {
            int b;
            do {
                b = is.read();
            } while (b >= 0 && b != 13 && b != 10);
        }

        CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder().onMalformedInput(CodingErrorAction.REPORT)
                .onUnmappableCharacter(CodingErrorAction.REPORT);
        reader = new BufferedReader(new InputStreamReader(is, decoder), BUFFER_SIZE);

        if (seekTo > 0L) {
            // read one more line, to make sure we are not inside a Windows linebreak (\r\n):
            reader.readLine();
        }
    }

    public synchronized void reset(Random random) throws IOException {
        reader.close();
        reader = null;
        open(random);
        id.set(0);
    }

    private final static char SEP = '\t';

    /** Holds one re-used Document and its Field instances. */
    private static final class DocState {
        final Document doc;
        final Field titleTokenized;
        final Field title;
        final Field titleDV;
        final Field body;
        final Field id;
        final Field idNum;
        final Field idNumDV;
        final Field date;

        public DocState() {
            doc = new Document();

            title = new StringField("title", "", Field.Store.NO);
            doc.add(title);

            FieldType ft = new FieldType(TextField.TYPE_STORED);
            ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
            ft.setStoreTermVectors(true);
            ft.setStoreTermVectorOffsets(true);
            ft.setStoreTermVectorPositions(true);

            titleTokenized = new Field("titleTokenized", "", ft);
            doc.add(titleTokenized);

            body = new Field("body", "", ft);
            doc.add(body);

            id = new StringField("docid", "", Field.Store.YES);
            doc.add(id);

            idNum = new IntPoint("docid_int", 0);
            doc.add(idNum);

            date = new StringField("date", "", Field.Store.YES);
            doc.add(date);

            titleDV = new SortedDocValuesField("titleDV", new BytesRef());
            idNumDV = new NumericDocValuesField("docid_intDV", 0);
            doc.add(titleDV);
            doc.add(idNumDV);
        }
    }

    // Per-thread DocState, so concurrent callers each re-use their own Document/Field instances.
    private final CloseableThreadLocal<DocState> threadDocs = new CloseableThreadLocal<>();

    /** Note: Document instance is re-used per-thread */
    public Document nextDoc() throws IOException {
        String line;
        synchronized (this) {
            line = reader.readLine();
            if (line == null) {
                // Always rewind at end:
                if (LuceneTestCase.VERBOSE) {
                    System.out.println("TEST: LineFileDocs: now rewind file...");
                }
                reader.close();
                reader = null;
                open(null);
                line = reader.readLine();
            }
        }

        DocState docState = threadDocs.get();
        if (docState == null) {
            docState = new DocState();
            threadDocs.set(docState);
        }

        int spot = line.indexOf(SEP);
        if (spot == -1) {
            throw new RuntimeException("line: [" + line + "] is in an invalid format !");
        }
        int spot2 = line.indexOf(SEP, 1 + spot);
        if (spot2 == -1) {
            throw new RuntimeException("line: [" + line + "] is in an invalid format !");
        }

        docState.body.setStringValue(line.substring(1 + spot2, line.length()));
        final String title = line.substring(0, spot);
        docState.title.setStringValue(title);
        if (docState.titleDV != null) {
            docState.titleDV.setBytesValue(new BytesRef(title));
        }
        docState.titleTokenized.setStringValue(title);
        docState.date.setStringValue(line.substring(1 + spot, spot2));
        final int i = id.getAndIncrement();
        docState.id.setStringValue(Integer.toString(i));
        docState.idNum.setIntValue(i);
        if (docState.idNumDV != null) {
            docState.idNumDV.setLongValue(i);
        }

        if (random.nextInt(5) == 4) {
            // Make some sparse fields
            Document doc = new Document();
            for (IndexableField field : docState.doc) {
                doc.add(field);
            }

            if (random.nextInt(3) == 1) {
                int x = random.nextInt(4);
                doc.add(new IntPoint("docLength" + x, line.length()));
            }

            if (random.nextInt(3) == 1) {
                int x = random.nextInt(4);
                doc.add(new IntPoint("docTitleLength" + x, title.length()));
            }

            if (random.nextInt(3) == 1) {
                int x = random.nextInt(4);
                doc.add(new NumericDocValuesField("docLength" + x, line.length()));
            }

            // TODO: more random sparse fields here too
        }

        return docState.doc;
    }
}
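
A minimal usage sketch, assuming a test method that declares throws IOException and already has a java.util.Random; the commented-out IndexWriter call is only illustrative and is not part of LineFileDocs:

Random random = new Random(42);
LineFileDocs lineFileDocs = new LineFileDocs(random); // reads LuceneTestCase.TEST_LINE_DOCS_FILE
try {
    for (int i = 0; i < 1000; i++) {
        Document doc = lineFileDocs.nextDoc();
        // The Document instance is re-used per thread, so consume it
        // (e.g. indexWriter.addDocument(doc)) before the next call.
    }
} finally {
    lineFileDocs.close();
}

Because nextDoc() rewinds the file at EOF, the loop above may request more documents than the line file actually contains.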