org.languagetool.dev.FrequencyIndexCreatorTest.java Source code

Introduction

Here is the source code for org.languagetool.dev.FrequencyIndexCreatorTest.java
Source

/* LanguageTool, a natural language style checker 
 * Copyright (C) 2014 Daniel Naber (http://www.danielnaber.de)
 * 
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
package org.languagetool.dev;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.FSDirectory;
import org.junit.Ignore;
import org.junit.Test;

import java.io.File;
import java.io.IOException;
import java.util.Scanner;

/**
 * Manual performance check for looking up 3-grams in a Lucene index.
 *
 * <p>The test is {@link Ignore}d because it depends on hard-coded local paths
 * (the index directory and the input text file) and prints its results to
 * standard output instead of asserting anything.
 */
public class FrequencyIndexCreatorTest {

    private static final File INDEX_DIR = new File("/media/Data/google-ngram/3gram/lucene-index");

    /**
     * Reads the input text line by line, splits every line at single spaces and
     * looks up each resulting 3-gram in the index, printing the stored count and
     * the elapsed time for every hit.
     *
     * @throws IOException if the index or the input file cannot be opened or read
     */
    @Test
    @Ignore("Interactive use only")
    public void testReadPerformance() throws IOException {
        // The reader is managed by try-with-resources as well, so its file handles
        // are released even when a lookup fails; resources are closed in reverse
        // order (reader first, then the directory).
        try (FSDirectory directory = FSDirectory.open(INDEX_DIR);
             DirectoryReader reader = DirectoryReader.open(directory)) {
            IndexSearcher searcher = new IndexSearcher(reader);
            try (Scanner scanner = new Scanner(new File("/lt/performance-test/en.txt"))) {
                while (scanner.hasNextLine()) {
                    String line = scanner.nextLine();
                    String[] parts = line.split(" ");
                    accessNgrams(parts, searcher);
                }
            }
        }
    }

    /**
     * Slides a window of three consecutive tokens over {@code parts} and queries
     * the {@code ngram} field of the index for each window. For every hit, the
     * value of the {@code count} field and the elapsed time are printed.
     *
     * @param parts    the tokens of one input line; fewer than three tokens produce no lookup
     * @param searcher the searcher used to query the index
     * @throws IOException if searching or loading a document fails
     */
    private void accessNgrams(String[] parts, IndexSearcher searcher) throws IOException {
        String prevPart = null;
        String prevPrevPart = null;
        for (String part : parts) {
            // A lookup is only possible once two preceding tokens have been seen.
            if (prevPart != null && prevPrevPart != null) {
                String ngram = prevPrevPart + " " + prevPart + " " + part;
                long startTime = System.currentTimeMillis();
                Query query = new TermQuery(new Term("ngram", ngram));
                ScoreDoc[] hits = searcher.search(query, null, 10).scoreDocs;
                for (ScoreDoc hit : hits) {
                    Document hitDoc = searcher.doc(hit.doc);
                    // Measured from before the query was built, so the time covers
                    // the search plus loading the stored document for this hit.
                    long runTime = System.currentTimeMillis() - startTime;
                    System.out.println(
                            ngram + ": " + hitDoc.getField("count").stringValue() + " (" + runTime + "ms)");
                }
            }
            // Shift the window by one token.
            prevPrevPart = prevPart;
            prevPart = part;
        }
    }
}