Java tutorial: indexing Google's ngram data with Lucene
/* LanguageTool, a natural language style checker
 * Copyright (C) 2014 Daniel Naber (http://www.danielnaber.de)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
 * USA
 */
package org.languagetool.dev;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.*;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.languagetool.languagemodel.LanguageModel;

import java.io.*;
import java.text.NumberFormat;
import java.util.*;
import java.util.zip.GZIPInputStream;

/**
 * Indexes *.gz files from Google's ngram corpus into a Lucene index.
 * Index time (1 doc = 1 ngram and its count, years are aggregated into one number):
 * 130µs/doc (both on an external USB hard disk and on an internal SSD) = about 7700 docs/sec.
 *
 * <p>The reason this isn't faster is not Lucene but the aggregation work we do, or simply
 * the large amount of data. Indexing every line as its own document takes 3µs/doc, i.e. Lucene can
 * index about 333,000 docs/sec.
 *
 * <p>Also see http://wiki.languagetool.org/finding-errors-using-big-data.
 * @since 2.7
 */
public class FrequencyIndexCreator {

  private static final int MIN_YEAR = 1910;
  private static final String NAME_REGEX1 = "googlebooks-eng-all-[1-5]gram-20120701-(.*?).gz";
  private static final String NAME_REGEX2 = "[a-z0-9]+-[a-z0-9]+-[a-z0-9]+-[a-z0-9]+-[a-z0-9]+_(.*?).gz";  // Hive result
  private static final int BUFFER_SIZE = 16384;

  private void run(File inputDir, File indexBaseDir) throws IOException {
    // assumes inputDir is an existing directory, otherwise listFiles() returns null
    List<File> files = Arrays.asList(inputDir.listFiles());
    Collections.sort(files);
    for (File file : files) {
      String name = file.getName();
      if (name.matches(".*_[A-Z]+_.*")) {
        System.out.println("Skipping POS tag file " + name);
        continue;
      }
      File indexDir;
      boolean hiveMode;
      if (name.matches(NAME_REGEX1)) {
        indexDir = new File(indexBaseDir, name.replaceAll(NAME_REGEX1, "$1"));
        hiveMode = false;
        System.out.println("Running in corpus mode (i.e. aggregation of years)");
      } else if (name.matches(NAME_REGEX2)) {
        indexDir = new File(indexBaseDir, name.replaceAll(NAME_REGEX2, "$1"));
        hiveMode = true;
        System.out.println("Running in Hive mode (i.e. no aggregation of years)");
      } else {
        System.out.println("Skipping " + name + " - doesn't match regex " + NAME_REGEX1 + " or " + NAME_REGEX2);
        continue;
      }
      if (indexDir.exists() && indexDir.isDirectory()) {
        System.out.println("Skipping " + name + " - index dir '" + indexDir + "' already exists");
        continue;
      }
      System.out.println("Index dir: " + indexDir);
      Directory directory = FSDirectory.open(indexDir);
      Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_4_10_1);
      IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_4_10_1, analyzer);
      config.setUseCompoundFile(false);  // ~10% speedup
      //config.setRAMBufferSizeMB(1000);
      try (IndexWriter writer = new IndexWriter(directory, config)) {
        indexLinesFromGoogleFile(writer, file, hiveMode);
      }
    }
  }
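
  // Input line formats handled by the method below. The sample lines are made-up
  // illustrations, not taken from a real corpus file:
  //   corpus mode:  <ngram>TAB<year>TAB<match_count>TAB<volume_count>
  //                 e.g. "the nice house\t1950\t35\t20" - match counts are summed over all years >= MIN_YEAR
  //   Hive mode:    <ngram>TAB<count>
  //                 e.g. "the nice house\t9542" - counts are already aggregated and indexed as-is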

  private void indexLinesFromGoogleFile(IndexWriter writer, File inputFile, boolean hiveMode) throws IOException {
    System.out.println("==== Working on " + inputFile + " ====");
    try (InputStream fileStream = new FileInputStream(inputFile);
         InputStream gzipStream = new GZIPInputStream(fileStream, BUFFER_SIZE);
         Reader decoder = new InputStreamReader(gzipStream, "utf-8");
         BufferedReader buffered = new BufferedReader(decoder, BUFFER_SIZE)) {
      int i = 0;
      long docCount = 0;
      long lineCount = 0;
      String prevText = null;
      long startTime = System.nanoTime() / 1000;  // in microseconds
      String line;
      //noinspection NestedAssignment
      while ((line = buffered.readLine()) != null) {
        lineCount++;
        String[] parts = line.split("\t");
        String text = parts[0];
        if (isRealPosTag(text)) {
          continue;
        }
        if (hiveMode) {
          String docCountStr = parts[1];
          addDoc(writer, text, docCountStr);
          if (++i % 500_000 == 0) {
            printStats(i, Long.parseLong(docCountStr), lineCount, text, startTime);
          }
        } else {
          int year = Integer.parseInt(parts[1]);
          if (year < MIN_YEAR) {
            continue;
          }
          if (prevText == null || prevText.equals(text)) {
            // aggregate years
            docCount += Long.parseLong(parts[2]);
          } else {
            //System.out.println(">"+ prevText + ": " + count);
            addDoc(writer, prevText, docCount + "");
            if (++i % 5_000 == 0) {
              printStats(i, docCount, lineCount, prevText, startTime);
            }
            docCount = Long.parseLong(parts[2]);
          }
        }
        prevText = text;
      }
      if (!hiveMode && prevText != null) {
        // flush the last aggregated ngram - the loop above only writes an ngram once the next one starts
        addDoc(writer, prevText, docCount + "");
      }
      printStats(i, docCount, lineCount, prevText, startTime);
    }
  }

  private boolean isRealPosTag(String text) {
    int idx = text.indexOf('_');
    if (idx == -1) {
      return false;
    } else {
      String tag = idx + 7 <= text.length() ? text.substring(idx, idx + 7) : "";  // _START_
      if (tag.equals(LanguageModel.GOOGLE_SENTENCE_START)) {
        return false;
      }
      String tag2 = idx + 5 <= text.length() ? text.substring(idx, idx + 5) : "";  // _END_
      if (tag2.equals(LanguageModel.GOOGLE_SENTENCE_END)) {
        return false;
      }
      return true;
    }
  }
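
  // Examples derived from the method above: isRealPosTag("run_VERB") == true, so POS-tagged
  // ngram lines get skipped; isRealPosTag("_START_ the") == false and
  // isRealPosTag("nice day _END_") == false, so sentence boundary markers are kept;
  // isRealPosTag("the nice house") == false, so plain ngrams are kept.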

  private void printStats(int i, long docCount, long lineCount, String prevText, long startTimeMicros) {
    long microsNow = System.nanoTime() / 1000;
    float microsPerDoc = (microsNow - startTimeMicros) / (float) Math.max(1, i);
    NumberFormat format = NumberFormat.getNumberInstance(Locale.US);
    System.out.printf("doc:%s line:%s ngram:%s occ:%s (%.0fµs/doc)\n",
        format.format(i), format.format(lineCount), prevText, format.format(docCount), microsPerDoc);
  }

  private void addDoc(IndexWriter writer, String text, String count) throws IOException {
    Document doc = new Document();
    // indexed as a single non-tokenized term, not stored
    doc.add(new Field("ngram", text, StringField.TYPE_NOT_STORED));
    // the count is stored only, not indexed
    FieldType fieldType = new FieldType();
    fieldType.setStored(true);
    Field countField = new Field("count", count, fieldType);
    doc.add(countField);
    writer.addDocument(doc);
  }

  public static void main(String[] args) throws IOException {
    if (args.length != 2) {
      System.out.println("Usage: " + FrequencyIndexCreator.class.getSimpleName() + " <inputDir> <outputDir>");
      System.out.println("   <inputDir> is a directory with the Google ngram data (raw corpus files or aggregated by Hive),");
      System.out.println("   please see http://wiki.languagetool.org/finding-errors-using-big-data");
      System.exit(1);
    }
    FrequencyIndexCreator creator = new FrequencyIndexCreator();
    creator.run(new File(args[0]), new File(args[1]));
  }

}
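
To verify what the indexer wrote, the created index can be opened with plain Lucene and queried for an ngram's stored count. The following lookup class is a minimal sketch and not part of LanguageTool: the index path /data/ngram-index/aa and the ngram "the nice house" are made-up examples, and it assumes the same Lucene 4.10 API that FrequencyIndexCreator uses.

import java.io.File;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;

public class FrequencyIndexLookup {

  public static void main(String[] args) throws Exception {
    // one of the per-input-file index directories created by FrequencyIndexCreator (made-up path)
    File indexDir = new File("/data/ngram-index/aa");
    try (DirectoryReader reader = DirectoryReader.open(FSDirectory.open(indexDir))) {
      IndexSearcher searcher = new IndexSearcher(reader);
      // the "ngram" field is indexed as a single non-tokenized term,
      // so the query term must match the full ngram exactly
      TopDocs topDocs = searcher.search(new TermQuery(new Term("ngram", "the nice house")), 1);
      if (topDocs.totalHits > 0) {
        Document doc = searcher.doc(topDocs.scoreDocs[0].doc);
        // "count" is the stored field written by addDoc()
        System.out.println("occurrence count: " + doc.get("count"));
      } else {
        System.out.println("ngram not found");
      }
    }
  }

}

Because the ngram field is not analyzed at index time, the StandardAnalyzer configured in the writer plays no role for these lookups; an exact TermQuery is all that is needed.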