org.languagetool.dev.index.Searcher.java Source code

Java tutorial

Introduction

Here is the source code for org.languagetool.dev.index.Searcher.java

Source

/* LanguageTool, a natural language style checker
 * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
 * 
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
package org.languagetool.dev.index;

import static org.languagetool.dev.dumpcheck.SentenceSourceIndexer.MAX_DOC_COUNT_FIELD;
import static org.languagetool.dev.dumpcheck.SentenceSourceIndexer.MAX_DOC_COUNT_FIELD_VAL;
import static org.languagetool.dev.dumpcheck.SentenceSourceIndexer.MAX_DOC_COUNT_VALUE;
import static org.languagetool.dev.index.PatternRuleQueryBuilder.FIELD_NAME;
import static org.languagetool.dev.index.PatternRuleQueryBuilder.FIELD_NAME_LOWERCASE;
import static org.languagetool.dev.index.PatternRuleQueryBuilder.SOURCE_FIELD_NAME;

import java.io.File;
import java.io.IOException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TimeLimitingCollector;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopFieldCollector;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Counter;
import org.languagetool.AnalyzedSentence;
import org.languagetool.JLanguageTool;
import org.languagetool.Language;
import org.languagetool.rules.Rule;
import org.languagetool.rules.RuleMatch;
import org.languagetool.rules.patterns.PatternRule;
import org.languagetool.tools.ContextTools;

/**
 * A class with a main() method that takes a rule id and the location of an
 * index, runs the rule's query on that index, and prints all matches.
 * Will transparently handle rules that are not supported, i.e. run on the candidate matches
 * up to a limit.
 * 
 * @author Tao Lin
 * @author Daniel Naber
 */
public class Searcher {

    /** When true, matches are printed as wiki markup with search/WikiBlame links instead of plain text. */
    private static final boolean WIKITEXT_OUTPUT = false;

    private final Directory directory;

    private int maxHits = 1000;
    private int maxSearchTimeMillis = 5000;
    private IndexSearcher indexSearcher;
    private DirectoryReader reader;
    // when false, the search thread is joined with Integer.MAX_VALUE, i.e. effectively no time limit:
    private boolean limitSearch = true;

    public Searcher(Directory directory) {
        this.directory = directory;
    }

    /** Opens a fresh reader/searcher pair on {@link #directory}. Counterpart of {@link #close()}. */
    private void open() throws IOException {
        reader = DirectoryReader.open(directory);
        indexSearcher = new IndexSearcher(reader);
        //System.out.println("Opened index " + directory + " with " + indexSearcher.getIndexReader().numDocs() + " docs");
    }

    private void close() throws IOException {
        if (reader != null) {
            reader.close();
        }
    }

    /**
     * @return the document count stored in the index's special max-doc-count document,
     *         or -1 if that document cannot be found
     */
    public int getDocCount() throws IOException {
        try (DirectoryReader reader = DirectoryReader.open(directory)) {
            final IndexSearcher indexSearcher = new IndexSearcher(reader);
            return getDocCount(indexSearcher);
        }
    }

    private int getDocCount(IndexSearcher indexSearcher) throws IOException {
        // the doc count is stored in a special marker document written at index time:
        final Term searchTerm = new Term(MAX_DOC_COUNT_FIELD, MAX_DOC_COUNT_FIELD_VAL);
        final TopDocs search = indexSearcher.search(new TermQuery(searchTerm), 1);
        if (search.totalHits != 1) {
            return -1;
        }
        final ScoreDoc scoreDoc = search.scoreDocs[0];
        final Document doc = indexSearcher.doc(scoreDoc.doc);
        return Integer.parseInt(doc.get(MAX_DOC_COUNT_VALUE));
    }

    public int getMaxHits() {
        return maxHits;
    }

    public void setMaxHits(int maxHits) {
        this.maxHits = maxHits;
    }

    public int getMaxSearchTimeMillis() {
        return maxSearchTimeMillis;
    }

    public void setMaxSearchTimeMillis(int maxSearchTimeMillis) {
        this.maxSearchTimeMillis = maxSearchTimeMillis;
    }

    /**
     * Runs the given pattern rule as a Lucene query against the index and checks the
     * candidate sentences with LanguageTool, collecting the real matches.
     *
     * @param rule the pattern rule to search matches for
     * @param language language used to build the query and to check candidate sentences
     * @return the matching sentences plus bookkeeping (doc count, Lucene match count, limits hit)
     * @throws SearchTimeoutException if the search did not finish within {@link #maxSearchTimeMillis}
     *         (only when {@code limitSearch} is set)
     * @throws UnsupportedPatternRuleException if no query can be built for the rule
     */
    public SearcherResult findRuleMatchesOnIndex(PatternRule rule, Language language)
            throws IOException, UnsupportedPatternRuleException {
        // it seems wasteful to re-open the index every time, but I had strange problems (OOM, Array out of bounds, ...)
        // when not doing so...
        open();
        try {
            final PatternRuleQueryBuilder patternRuleQueryBuilder = new PatternRuleQueryBuilder(language);
            final Query query = patternRuleQueryBuilder.buildRelaxedQuery(rule);
            if (query == null) {
                throw new NullPointerException("Cannot search on null query for rule: " + rule.getId());
            }

            System.out.println("Running query: " + query.toString(FIELD_NAME_LOWERCASE));
            final SearchRunnable runnable = new SearchRunnable(indexSearcher, query, language, rule);
            final Thread searchThread = new Thread(runnable);
            searchThread.start();
            try {
                // using a TimeLimitingCollector is not enough, as it doesn't cover all time required to
                // search for a complicated regex, so interrupt the whole thread instead:
                if (limitSearch) { // I don't know a simpler way to achieve this...
                    searchThread.join(maxSearchTimeMillis);
                } else {
                    searchThread.join(Integer.MAX_VALUE);
                }
                searchThread.interrupt();
            } catch (InterruptedException e) {
                // restore the interrupt status before translating to an unchecked exception:
                Thread.currentThread().interrupt();
                throw new RuntimeException("Search thread got interrupted for query " + query, e);
            }
            // interrupt() only leaves the flag set if the thread was still alive, i.e. timed out:
            if (searchThread.isInterrupted()) {
                throw new SearchTimeoutException(
                        "Search timeout of " + maxSearchTimeMillis + "ms reached for query " + query);
            }
            final Exception exception = runnable.getException();
            if (exception != null) {
                if (exception instanceof SearchTimeoutException) {
                    throw (SearchTimeoutException) exception;
                }
                throw new RuntimeException(
                        "Exception during search for query " + query + " on rule " + rule.getId(), exception);
            }

            final List<MatchingSentence> matchingSentences = runnable.getMatchingSentences();
            final int sentencesChecked = getSentenceCheckCount(query, indexSearcher);
            final SearcherResult searcherResult = new SearcherResult(matchingSentences, sentencesChecked, query);
            searcherResult.setHasTooManyLuceneMatches(runnable.hasTooManyLuceneMatches());
            searcherResult.setLuceneMatchCount(runnable.getLuceneMatchCount());
            if (runnable.hasTooManyLuceneMatches()) {
                // more potential matches than we can check in an acceptable time :-(
                searcherResult.setDocCount(maxHits);
            } else {
                searcherResult.setDocCount(getDocCount(indexSearcher));
            }
            //TODO: the search itself could also timeout, don't just ignore that:
            //searcherResult.setResultIsTimeLimited(limitedTopDocs.resultIsTimeLimited);
            return searcherResult;
        } finally {
            close();
        }
    }

    /**
     * Runs the query under a {@link TimeLimitingCollector} driven by a background clock thread.
     * The returned object records whether the time limit kicked in.
     */
    private PossiblyLimitedTopDocs getTopDocs(Query query, Sort sort) throws IOException {
        final TopFieldCollector topCollector = TopFieldCollector.create(sort, maxHits, true, false, false, false);
        final Counter clock = Counter.newCounter(true);
        final int waitMillis = 1000;
        // TODO: if we interrupt the whole thread anyway, do we still need the TimeLimitingCollector?
        final TimeLimitingCollector collector = new TimeLimitingCollector(topCollector, clock,
                maxSearchTimeMillis / waitMillis);
        collector.setBaseline(0);
        final Thread counterThread = new Thread() {
            @Override
            public void run() {
                final long startTime = System.currentTimeMillis();
                while (true) {
                    final long runTimeMillis = System.currentTimeMillis() - startTime;
                    if (runTimeMillis > maxSearchTimeMillis) {
                        // make sure there's no lingering thread for too long
                        return;
                    }
                    clock.addAndGet(1);
                    try {
                        Thread.sleep(waitMillis);
                    } catch (InterruptedException e) {
                        // we're only a clock for the collector - exit quietly when interrupted:
                        Thread.currentThread().interrupt();
                        return;
                    }
                }
            }
        };
        counterThread.setName("LuceneSearchTimeoutThread");
        counterThread.start();

        boolean timeLimitActivated = false;
        try {
            indexSearcher.search(query, collector);
        } catch (TimeLimitingCollector.TimeExceededException e) {
            timeLimitActivated = true;
        }
        return new PossiblyLimitedTopDocs(topCollector.topDocs(), timeLimitActivated);
    }

    /**
     * @return all pattern rules with the given id (one per sub-rule)
     * @throws PatternRuleNotFoundException if no pattern rule with that id exists for the language
     */
    List<PatternRule> getRuleById(String ruleId, Language language) throws IOException {
        List<PatternRule> rules = new ArrayList<>();
        JLanguageTool langTool = new JLanguageTool(language);
        langTool.activateDefaultPatternRules();
        for (Rule rule : langTool.getAllRules()) {
            if (rule.getId().equals(ruleId) && rule instanceof PatternRule) {
                rules.add((PatternRule) rule);
            }
        }
        if (rules.size() > 0) {
            return rules;
        } else {
            throw new PatternRuleNotFoundException(ruleId, language);
        }
    }

    private int getSentenceCheckCount(Query query, IndexSearcher indexSearcher) {
        final int indexSize = indexSearcher.getIndexReader().numDocs();
        // we actually check up to maxHits sentences:
        // TODO: ??
        final int sentencesChecked = Math.min(maxHits, indexSize);
        return sentencesChecked;
    }

    /** Re-checks the Lucene candidate hits with LanguageTool and keeps only the true matches. */
    private List<MatchingSentence> findMatchingSentences(IndexSearcher indexSearcher, TopDocs topDocs,
            JLanguageTool languageTool) throws IOException {
        final List<MatchingSentence> matchingSentences = new ArrayList<>();
        for (ScoreDoc match : topDocs.scoreDocs) {
            final Document doc = indexSearcher.doc(match.doc);
            final String sentence = doc.get(FIELD_NAME);
            final List<RuleMatch> ruleMatches = languageTool.check(sentence);
            if (ruleMatches.size() > 0) {
                final String source = doc.get(SOURCE_FIELD_NAME);
                final String title = doc.get(Indexer.TITLE_FIELD_NAME);
                final AnalyzedSentence analyzedSentence = languageTool.getAnalyzedSentence(sentence);
                final MatchingSentence matchingSentence = new MatchingSentence(sentence, source, title,
                        analyzedSentence, ruleMatches);
                matchingSentences.add(matchingSentence);
            }
        }
        return matchingSentences;
    }

    /** @return a JLanguageTool with all rules disabled except the given pattern rule */
    private JLanguageTool getLanguageToolWithOneRule(Language lang, PatternRule patternRule) {
        final JLanguageTool langTool = new JLanguageTool(lang);
        for (Rule rule : langTool.getAllActiveRules()) {
            langTool.disableRule(rule.getId());
        }
        langTool.addRule(patternRule);
        langTool.enableDefaultOffRule(patternRule.getId()); // rule might be off by default
        return langTool;
    }

    /** Lucene top docs plus a flag telling whether the result was cut short by the time limit. */
    class PossiblyLimitedTopDocs {
        TopDocs topDocs;
        boolean resultIsTimeLimited;

        PossiblyLimitedTopDocs(TopDocs topDocs, boolean resultIsTimeLimited) {
            this.topDocs = topDocs;
            this.resultIsTimeLimited = resultIsTimeLimited;
        }
    }

    private static void ensureCorrectUsageOrExit(String[] args) {
        if (args.length < 3 || (args.length == 4 && !"--no_limit".equals(args[3]))) {
            System.err.println("Usage: Searcher <ruleId> <languageCode> <indexDir> [--no_limit]");
            System.err.println("\truleId       Id of the rule to search for (or comma-separated list of ids)");
            System.err.println("\tlanguageCode short language code, e.g. 'en' for English");
            System.err.println("\tindexDir     path to a directory containing the index");
            System.err.println("\t--no_limit   do not limit search time");
            System.exit(1);
        }
    }

    /** Runs the Lucene query and the LanguageTool check on a worker thread so the caller can time it out. */
    class SearchRunnable implements Runnable {

        private final IndexSearcher indexSearcher;
        private final Query query;
        private final Language language;
        private final PatternRule rule;

        private List<MatchingSentence> matchingSentences;
        private Exception exception;
        private boolean tooManyLuceneMatches;
        private int luceneMatchCount;

        SearchRunnable(IndexSearcher indexSearcher, Query query, Language language, PatternRule rule) {
            this.indexSearcher = indexSearcher;
            this.query = query;
            this.language = language;
            this.rule = rule;
        }

        @Override
        public void run() {
            try {
                final Sort sort = new Sort(new SortField("docCount", SortField.Type.INT)); // do not sort by relevance as this will move the shortest documents to the top
                final long t1 = System.currentTimeMillis();
                final JLanguageTool languageTool = getLanguageToolWithOneRule(language, rule);
                final long langToolCreationTime = System.currentTimeMillis() - t1;
                final long t2 = System.currentTimeMillis();
                final PossiblyLimitedTopDocs limitedTopDocs = getTopDocs(query, sort);
                final long luceneTime = System.currentTimeMillis() - t2;
                final long t3 = System.currentTimeMillis();
                luceneMatchCount = limitedTopDocs.topDocs.totalHits;
                tooManyLuceneMatches = limitedTopDocs.topDocs.scoreDocs.length >= maxHits;
                matchingSentences = findMatchingSentences(indexSearcher, limitedTopDocs.topDocs, languageTool);
                System.out.println("Check done in " + langToolCreationTime + "/" + luceneTime + "/"
                        + (System.currentTimeMillis() - t3) + "ms (LT creation/Lucene/matching) for "
                        + limitedTopDocs.topDocs.scoreDocs.length + " docs");
            } catch (Exception e) {
                // stash the exception for the controlling thread instead of losing it here:
                exception = e;
            }
        }

        Exception getException() {
            return exception;
        }

        /**
         * There were more Lucene matches than we can actually check with LanguageTool in
         * an acceptable time, so real matches might be lost.
         */
        boolean hasTooManyLuceneMatches() {
            return tooManyLuceneMatches;
        }

        int getLuceneMatchCount() {
            return luceneMatchCount;
        }

        List<MatchingSentence> getMatchingSentences() {
            return matchingSentences;
        }
    }

    private static ContextTools getContextTools(int contextSize) {
        final ContextTools contextTools = new ContextTools();
        contextTools.setEscapeHtml(false);
        contextTools.setContextSize(contextSize);
        contextTools.setErrorMarkerStart("**");
        contextTools.setErrorMarkerEnd("**");
        return contextTools;
    }

    public static void main(String[] args) throws Exception {
        ensureCorrectUsageOrExit(args);
        final long startTime = System.currentTimeMillis();
        final String[] ruleIds = args[0].split(",");
        final String languageCode = args[1];
        final Language language = Language.getLanguageForShortName(languageCode);
        final File indexDir = new File(args[2]);
        // BUG FIX: "--no_limit" means the search must NOT be limited, so limitSearch is the negation
        // (previously the flag was inverted, limiting exactly when --no_limit was given):
        final boolean limitSearch = !(args.length > 3 && "--no_limit".equals(args[3]));
        final Searcher searcher = new Searcher(new SimpleFSDirectory(indexDir));
        if (!limitSearch) {
            searcher.setMaxHits(100_000);
        }
        searcher.limitSearch = limitSearch;
        final ContextTools contextTools = getContextTools(140);
        int totalMatches = 0;
        for (String ruleId : ruleIds) {
            final long ruleStartTime = System.currentTimeMillis();
            for (PatternRule rule : searcher.getRuleById(ruleId, language)) {
                System.out.println("===== " + ruleId + "[" + rule.getSubId()
                        + "] =========================================================");
                final SearcherResult searcherResult = searcher.findRuleMatchesOnIndex(rule, language);
                int i = 1;
                if (searcherResult.getMatchingSentences().size() == 0) {
                    System.out.println("[no matches]");
                }
                for (MatchingSentence ruleMatch : searcherResult.getMatchingSentences()) {
                    for (RuleMatch match : ruleMatch.getRuleMatches()) {
                        String context = contextTools.getContext(match.getFromPos(), match.getToPos(),
                                ruleMatch.getSentence());
                        if (WIKITEXT_OUTPUT) {
                            ContextTools contextTools2 = getContextTools(0);
                            String coveredText = contextTools2.getContext(match.getFromPos(), match.getToPos(),
                                    ruleMatch.getSentence());
                            coveredText = coveredText.replaceFirst("^\\.\\.\\.", "").replaceFirst("\\.\\.\\.$", "");
                            coveredText = coveredText.replaceFirst("^\\*\\*", "").replaceFirst("\\*\\*$", "");
                            String encodedTextWithQuotes = URLEncoder.encode("\"" + coveredText + "\"", "UTF-8");
                            String searchLink = "https://de.wikipedia.org/w/index.php?search="
                                    + encodedTextWithQuotes + "&title=Spezial%3ASuche&go=Artikel";
                            context = context.replaceAll("\\*\\*.*?\\*\\*",
                                    "[" + searchLink + " " + coveredText + "]");
                            String encTitle = URLEncoder.encode(ruleMatch.getTitle(), "UTF-8");
                            String encodedText = URLEncoder.encode(coveredText, "UTF-8");
                            System.out.println("# [[" + ruleMatch.getTitle() + "]]: " + context
                                    + " ([http://wikipedia.ramselehof.de/wikiblame.php?user_lang=de&lang=de&project=wikipedia&article="
                                    + encTitle + "&needle=" + encodedText
                                    + "&skipversions=0&ignorefirst=0&limit=500&searchmethod=int&order=desc&start=Start WikiBlame])");
                        } else {
                            System.out.println(i + ": " + context + " [" + ruleMatch.getSource() + "]");
                        }
                    }
                    totalMatches += ruleMatch.getRuleMatches().size();
                    i++;
                }
                System.out.println("Time: " + (System.currentTimeMillis() - ruleStartTime) + "ms");
            }
        }
        System.out.println(
                "Total time: " + (System.currentTimeMillis() - startTime) + "ms, " + totalMatches + " matches");
    }

}