IndexTaxis.java Source code


Introduction

Here is the source code for IndexTaxis.java.
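IndexTaxis is a small multi-threaded Lucene indexing benchmark. It reads a pre-chunked NYC taxi trip CSV file (see README.nyctaxis for the chunked input format), parses each row into Lucene point, doc-values, and text fields, and indexes the rows with a shared IndexWriter while printing throughput every 100,000 documents. It takes three arguments: the index directory, the number of indexing threads, and the path to the chunked CSV file.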

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.BufferedInputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.DoublePoint;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.IntPoint;
import org.apache.lucene.document.LongPoint;
import org.apache.lucene.document.SortedNumericDocValuesField;
import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.NumericUtils;
import org.apache.lucene.util.PrintStreamInfoStream;

// See README.nyctaxis for the chunked docs source

// First: cd to /foo/bar/baz/lucene-solr-clone/lucene

// Then: ant clean jar

// Then: javac -cp build/core/classes/java /l/util/src/main/perf/IndexTaxis.java ; java -cp build/core/classes/java:/l/util/src/main/perf IndexTaxis /c/taxisjava 1 /lucenedata/nyc-taxi-data/alltaxis.25M.csv.blocks 

public class IndexTaxis {

    private static final byte NEWLINE = (byte) '\n';
    private static final byte COMMA = (byte) ',';
    private static final byte[] header = new byte[128];

    static long startNS;

    private static class Chunk {
        public final byte[] bytes;
        public final int docCount;

        public Chunk(byte[] bytes, int docCount) {
            this.bytes = bytes;
            this.docCount = docCount;
        }
    }

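    /**
     * Reads the next chunk from the shared input stream.  Each chunk begins with an ASCII
     * header line of the form "byteCount docCount", followed by byteCount bytes of
     * newline-terminated CSV rows; the fixed 128-byte header read assumes every chunk is
     * at least that large.  Synchronized because all indexing threads share one stream.
     */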
    private synchronized static Chunk readChunk(BufferedInputStream docs) throws IOException {
        int count = docs.read(header, 0, header.length);
        if (count == -1) {
            // end
            return null;
        }

        int upto = 0;
        while (upto < header.length) {
            if (header[upto] == NEWLINE) {
                break;
            }
            upto++;
        }
        if (upto == header.length) {
            throw new AssertionError();
        }
        String[] parts = new String(header, 0, upto, StandardCharsets.UTF_8).split(" ");
        if (parts.length != 2) {
            throw new AssertionError();
        }
        int byteCount = Integer.parseInt(parts[0]);
        int docCount = Integer.parseInt(parts[1]);
        byte[] chunk = new byte[byteCount];
        int fragment = header.length - upto - 1;
        System.arraycopy(header, upto + 1, chunk, 0, fragment);
        // a single read() is not guaranteed to fill the buffer, so loop until the body is complete
        for (int read = fragment; read < chunk.length; read += count) {
            count = docs.read(chunk, read, chunk.length - read);
            if (count == -1) {
                throw new AssertionError();
            }
        }
        return new Chunk(chunk, docCount);
    }

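    /**
     * Adds one CSV column to the document, choosing the field type from the column name:
     * low-cardinality codes are indexed as StringField plus SortedSetDocValuesField,
     * vendor_name as a tokenized TextField, timestamps and passenger counts as
     * LongPoint/IntPoint plus SortedNumericDocValuesField, and fares, distances and
     * coordinates as DoublePoint plus a sortable-long doc value.
     */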
    static void addOneField(Document doc, String fieldName, String rawValue) {
        // nocommit
        /*
        if (fieldName.equals("pick_up_lat")) {
          double value = Double.parseDouble(rawValue);
          doc.add(new DoublePoint(fieldName, value));
          doc.add(new SortedNumericDocValuesField(fieldName, NumericUtils.doubleToSortableLong(value)));
        }
        */
        switch (fieldName) {
        case "vendor_id":
        case "cab_color":
        case "payment_type":
        case "trip_type":
        case "rate_code":
        case "store_and_fwd_flag":
            doc.add(new StringField(fieldName, rawValue, Field.Store.NO));
            doc.add(new SortedSetDocValuesField(fieldName, new BytesRef(rawValue)));
            break;
        case "vendor_name":
            doc.add(new TextField(fieldName, rawValue, Field.Store.NO));
            break;
        case "pick_up_date_time":
        case "drop_off_date_time": {
            long value = Long.parseLong(rawValue);
            doc.add(new LongPoint(fieldName, value));
            doc.add(new SortedNumericDocValuesField(fieldName, value));
        }
            break;
        case "passenger_count": {
            int value = Integer.parseInt(rawValue);
            doc.add(new IntPoint(fieldName, value));
            doc.add(new SortedNumericDocValuesField(fieldName, value));
        }
            break;
        case "trip_distance":
        case "pick_up_lat":
        case "pick_up_lon":
        case "drop_off_lat":
        case "drop_off_lon":
        case "fare_amount":
        case "surcharge":
        case "mta_tax":
        case "extra":
        case "ehail_fee":
        case "improvement_surcharge":
        case "tip_amount":
        case "tolls_amount":
        case "total_amount": {
            double value;
            try {
                value = Double.parseDouble(rawValue);
            } catch (NumberFormatException nfe) {
                System.out.println(
                        "WARNING: failed to parse \"" + rawValue + "\" as double for field \"" + fieldName + "\"");
                return;
            }
            doc.add(new DoublePoint(fieldName, value));
            doc.add(new SortedNumericDocValuesField(fieldName, NumericUtils.doubleToSortableLong(value)));
        }
            break;
        default:
            throw new AssertionError("failed to handle field \"" + fieldName + "\"");
        }
    }

    /** Index all documents contained in one chunk */
    static void indexOneChunk(String[] fields, Chunk chunk, IndexWriter w, AtomicInteger docCounter,
            AtomicLong bytesCounter) throws IOException {

        byte[] bytes = chunk.bytes;
        if (bytes[bytes.length - 1] != NEWLINE) {
            throw new AssertionError();
        }
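        // Index all rows of the chunk as a single block: the Iterable below lazily parses the
        // CSV bytes and builds each Document on demand as IndexWriter.addDocuments pulls it.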
        w.addDocuments(new Iterable<Document>() {
            @Override
            public Iterator<Document> iterator() {
                return new Iterator<Document>() {
                    private int i;
                    private Document nextDoc;
                    private boolean nextSet;
                    private int lastLineStart;
                    private int chunkDocCount;

                    @Override
                    public boolean hasNext() {
                        if (nextSet == false) {
                            setNextDoc();
                            nextSet = true;
                        }

                        return nextDoc != null;
                    }

                    @Override
                    public Document next() {
                        assert nextSet;
                        nextSet = false;
                        Document result = nextDoc;
                        nextDoc = null;
                        return result;
                    }

                    private void setNextDoc() {
                        Document doc = new Document();
                        int fieldUpto = 0;
                        int lastFieldStart = i;
                        for (; i < bytes.length; i++) {
                            byte b = bytes[i];
                            if (b == NEWLINE || b == COMMA) {
                                if (i > lastFieldStart) {
                                    String s = new String(bytes, lastFieldStart, i - lastFieldStart,
                                            StandardCharsets.UTF_8);
                                    addOneField(doc, fields[fieldUpto], s);
                                }
                                if (b == NEWLINE) {
                                    if (fieldUpto != fields.length - 1) {
                                        throw new AssertionError("fieldUpto=" + fieldUpto + " vs fields.length-1="
                                                + (fields.length - 1));
                                    }
                                    chunkDocCount++;
                                    this.nextDoc = doc;
                                    int x = docCounter.incrementAndGet();
                                    long y = bytesCounter.addAndGet((i + 1) - lastLineStart);
                                    if (x % 100000 == 0) {
                                        double sec = (System.nanoTime() - startNS) / 1000000000.0;
                                        System.out.println(String.format(Locale.ROOT,
                                                "%.1f sec: %d docs; %.1f docs/sec; %.1f MB/sec", sec, x, x / sec,
                                                (y / 1024. / 1024.) / sec));
                                    }
                                    fieldUpto = 0;
                                    i++;
                                    lastLineStart = i;
                                    return;
                                } else {
                                    fieldUpto++;
                                }
                                lastFieldStart = i + 1;
                            }
                        }
                        // System.out.println("chunk doc count: " + chunkDocCount);
                    }
                };
            }
        });
    }

    public static void main(String[] args) throws Exception {
        Path indexPath = Paths.get(args[0]);
        Directory dir = FSDirectory.open(indexPath);
        int threadCount = Integer.parseInt(args[1]);
        Path docsPath = Paths.get(args[2]);

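        // 1 GB RAM buffer, a fresh (CREATE mode) index, and IndexWriter's info stream logged to stdout: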
        IndexWriterConfig iwc = new IndexWriterConfig();
        //System.out.println("NOW SET INFO STREAM");
        iwc.setRAMBufferSizeMB(1024.);
        iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
        iwc.setInfoStream(new PrintStreamInfoStream(System.out));
        //((ConcurrentMergeScheduler) iwc.getMergeScheduler()).disableAutoIOThrottle();

        final IndexWriter w = new IndexWriter(dir, iwc);

        BufferedInputStream docs = new BufferedInputStream(Files.newInputStream(docsPath, StandardOpenOption.READ));

        // parse the header fields
        List<String> fieldsList = new ArrayList<>();
        StringBuilder builder = new StringBuilder();
        while (true) {
            int x = docs.read();
            if (x == -1) {
                throw new IllegalArgumentException(
                        "hit EOF while trying to read CSV header; are you sure you have the right CSV file!");
            }
            byte b = (byte) x;
            if (b == NEWLINE) {
                fieldsList.add(builder.toString());
                break;
            } else if (b == COMMA) {
                fieldsList.add(builder.toString());
                builder.setLength(0);
            } else {
                // this is OK because headers are all ascii:
                builder.append((char) b);
            }
        }

        final String[] fields = fieldsList.toArray(new String[fieldsList.size()]);

        Thread[] threads = new Thread[threadCount];

        final AtomicInteger docCounter = new AtomicInteger();
        final AtomicLong bytesCounter = new AtomicLong();

        startNS = System.nanoTime();

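        // Each worker thread repeatedly pulls the next chunk from the shared (synchronized)
        // reader and indexes it, so the input file is consumed exactly once across all threads.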
        for (int i = 0; i < threadCount; i++) {
            final int threadID = i;
            threads[i] = new Thread() {
                @Override
                public void run() {
                    try {
                        _run();
                    } catch (Exception e) {
                        throw new RuntimeException(e);
                    }
                }

                private void _run() throws IOException {
                    while (true) {
                        Chunk chunk = readChunk(docs);
                        if (chunk == null) {
                            break;
                        }
                        indexOneChunk(fields, chunk, w, docCounter, bytesCounter);
                    }
                }
            };
            threads[i].start();
        }

        for (int i = 0; i < threadCount; i++) {
            threads[i].join();
        }
        System.out.println("Indexing done; now close");

        w.close();
        docs.close();
    }
}
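
For context, below is a minimal sketch of how the fields written by IndexTaxis could be searched. It is not part of the original source; the class name, the use of args[0] for the index directory, and the range query on the total_amount field are illustrative assumptions.

import java.nio.file.Paths;

import org.apache.lucene.document.DoublePoint;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;

public class SearchTaxis {
    public static void main(String[] args) throws Exception {
        // open the index directory created by IndexTaxis (passed as the first argument)
        try (DirectoryReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(args[0])))) {
            IndexSearcher searcher = new IndexSearcher(reader);
            // range query over the DoublePoint "total_amount" field added by addOneField
            Query query = DoublePoint.newRangeQuery("total_amount", 10.0, 20.0);
            TopDocs hits = searcher.search(query, 10);
            System.out.println(hits.totalHits + " trips with total_amount in [10.0, 20.0]");
        }
    }
}

DoublePoint.newRangeQuery matches the DoublePoint fields added in addOneField; sorting or faceting would instead use the doc-values fields added alongside them.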