org.eclipse.che.api.search.server.impl.LuceneSearcher.java Source code

Introduction

Here is the source code for org.eclipse.che.api.search.server.impl.LuceneSearcher.java, the Lucene-based workspace file searcher from the Eclipse Che project.

Source

/*
 * Copyright (c) 2012-2018 Red Hat, Inc.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/epl-v10.html
 *
 * Contributors:
 *   Red Hat, Inc. - initial API and implementation
 */
package org.eclipse.che.api.search.server.impl;

import static com.google.common.collect.Lists.newArrayList;
import static org.eclipse.che.api.fs.server.WsPathUtils.nameOf;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.io.CharStreams;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.MalformedInputException;
import java.nio.charset.StandardCharsets;
import java.nio.file.FileVisitResult;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.PathMatcher;
import java.nio.file.SimpleFileVisitor;
import java.nio.file.attribute.BasicFileAttributes;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Scanner;
import java.util.Set;
import java.util.concurrent.CountDownLatch;
import javax.annotation.PostConstruct;
import javax.inject.Inject;
import javax.inject.Named;
import javax.inject.Singleton;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.LowerCaseFilterFactory;
import org.apache.lucene.analysis.core.WhitespaceTokenizerFactory;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.SearcherFactory;
import org.apache.lucene.search.SearcherManager;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.TokenSources;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.SingleInstanceLockFactory;
import org.apache.lucene.util.BytesRef;
import org.eclipse.che.api.fs.server.PathTransformer;
import org.eclipse.che.api.search.server.InvalidQueryException;
import org.eclipse.che.api.search.server.OffsetData;
import org.eclipse.che.api.search.server.QueryExecutionException;
import org.eclipse.che.api.search.server.QueryExpression;
import org.eclipse.che.api.search.server.SearchResult;
import org.eclipse.che.api.search.server.Searcher;
import org.eclipse.che.commons.schedule.ScheduleRate;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Lucene-based searcher.
 *
 * @author andrew00x
 * @author Sergii Kabashniuk
 */
@Singleton
public class LuceneSearcher implements Searcher {

    private static final Logger LOG = LoggerFactory.getLogger(LuceneSearcher.class);

    private static final int RESULT_LIMIT = 1000;
    private static final String PATH_FIELD = "path";
    private static final String NAME_FIELD = "name";
    private static final String TEXT_FIELD = "text";

    private final Set<PathMatcher> excludePatterns;
    private final File indexDirectory;
    private final PathTransformer pathTransformer;

    private final File root;
    private final IndexWriter luceneIndexWriter;
    private final SearcherManager searcherManager;
    private final Analyzer analyzer;
    private final CountDownLatch initialIndexingLatch = new CountDownLatch(1);
    private final Sort sort;

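    /**
     * Creates the index directory if needed, builds a whitespace + lowercase analyzer, and opens a
     * near-real-time {@link SearcherManager} on top of the single {@link IndexWriter}.
     */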
    @Inject
    public LuceneSearcher(@Named("vfs.index_filter_matcher") Set<PathMatcher> excludePatterns,
            @Named("vfs.local.fs_index_root_dir") File indexDirectory,
            @Named("che.user.workspaces.storage") File root, PathTransformer pathTransformer) throws IOException {

        if (indexDirectory.exists()) {
            if (indexDirectory.isFile()) {
                throw new IOException("Wrong configuration `vfs.local.fs_index_root_dir` is a file");
            }
        } else {
            Files.createDirectories(indexDirectory.toPath());
        }

        this.indexDirectory = indexDirectory;
        this.root = root;
        this.excludePatterns = excludePatterns;
        this.pathTransformer = pathTransformer;
        this.analyzer = CustomAnalyzer.builder().withTokenizer(WhitespaceTokenizerFactory.class)
                .addTokenFilter(LowerCaseFilterFactory.class).build();
        this.luceneIndexWriter = new IndexWriter(
                FSDirectory.open(indexDirectory.toPath(), new SingleInstanceLockFactory()),
                new IndexWriterConfig(analyzer));
        this.searcherManager = new SearcherManager(luceneIndexWriter, true, true, new SearcherFactory());
        this.sort = new Sort(SortField.FIELD_SCORE, new SortField(PATH_FIELD, SortField.Type.STRING));
    }

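    /** Kicks off initial indexing of the workspace root on a daemon thread so startup is not blocked. */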
    @PostConstruct
    @VisibleForTesting
    void initialize() {
        Thread initializer = new Thread(() -> {
            try {
                long start = System.currentTimeMillis();
                add(root.toPath());
                LOG.info("Initial indexing complete after {} msec ", System.currentTimeMillis() - start);
            } finally {
                initialIndexingLatch.countDown();
            }
        });
        initializer.setName("LuceneSearcherInitThread");
        initializer.setDaemon(true);
        initializer.start();
    }

    @VisibleForTesting
    CountDownLatch getInitialIndexingLatch() {
        return initialIndexingLatch;
    }

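    /** Flushes buffered index changes to disk every 30 seconds. */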
    @ScheduleRate(period = 30, initialDelay = 30)
    private void commitIndex() throws IOException {
        luceneIndexWriter.commit();
    }

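    /**
     * Runs the query against a freshly refreshed searcher, returning at most
     * {@code min(maxItems, RESULT_LIMIT)} entries plus, when more hits remain, a query expression
     * for the next page.
     */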
    @Override
    public SearchResult search(QueryExpression query) throws InvalidQueryException, QueryExecutionException {
        IndexSearcher luceneSearcher = null;
        try {
            final long startTime = System.currentTimeMillis();
            searcherManager.maybeRefresh();
            luceneSearcher = searcherManager.acquire();

            Query luceneQuery = createLuceneQuery(query);

            ScoreDoc after = null;
            final int numSkipDocs = Math.max(0, query.getSkipCount());
            if (numSkipDocs > 0) {
                after = skipScoreDocs(luceneSearcher, luceneQuery, numSkipDocs);
            }

            final int numDocs = query.getMaxItems() > 0 ? Math.min(query.getMaxItems(), RESULT_LIMIT)
                    : RESULT_LIMIT;
            TopDocs topDocs = luceneSearcher.searchAfter(after, luceneQuery, numDocs, sort, true, true);
            final long totalHitsNum = topDocs.totalHits;

            List<SearchResultEntry> results = newArrayList();
            List<OffsetData> offsetData = Collections.emptyList();
            for (int i = 0; i < topDocs.scoreDocs.length; i++) {
                ScoreDoc scoreDoc = topDocs.scoreDocs[i];
                int docId = scoreDoc.doc;
                Document doc = luceneSearcher.doc(docId);
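                // When positions are requested, re-tokenize the stored text and record the offsets,
                // score, and line of every term that contributes to the match.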
                if (query.isIncludePositions()) {
                    offsetData = new ArrayList<>();
                    String txt = doc.get(TEXT_FIELD);
                    if (txt != null) {
                        IndexReader reader = luceneSearcher.getIndexReader();

                        TokenStream tokenStream = TokenSources.getTokenStream(TEXT_FIELD,
                                reader.getTermVectors(docId), txt, luceneIndexWriter.getAnalyzer(), -1);

                        CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
                        OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);

                        QueryScorer queryScorer = new QueryScorer(luceneQuery);
                        // TODO think about this constant
                        queryScorer.setMaxDocCharsToAnalyze(1_000_000);
                        TokenStream newStream = queryScorer.init(tokenStream);
                        if (newStream != null) {
                            tokenStream = newStream;
                        }
                        queryScorer.startFragment(null);

                        tokenStream.reset();

                        int startOffset, endOffset;
                        // TODO think about this constant
                        for (boolean next = tokenStream.incrementToken(); next
                                && (offsetAtt.startOffset() < 1_000_000); next = tokenStream.incrementToken()) {
                            startOffset = offsetAtt.startOffset();
                            endOffset = offsetAtt.endOffset();

                            if ((endOffset > txt.length()) || (startOffset > txt.length())) {
                                throw new QueryExecutionException("Token " + termAtt.toString()
                                        + " exceeds the length of the analyzed text (" + txt.length() + ")");
                            }

                            float res = queryScorer.getTokenScore();
                            if (res > 0.0F && startOffset <= endOffset) {
                                String tokenText = txt.substring(startOffset, endOffset);
                                Scanner sc = new Scanner(txt);
                                int lineNum = 1;
                                long len = 0;
                                String foundLine = "";
                                while (sc.hasNextLine()) {
                                    foundLine = sc.nextLine();

                                    len += foundLine.length() + 1; // +1 for the line separator (assumes a one-char separator)
                                    if (len > startOffset) {
                                        break;
                                    }
                                    lineNum++;
                                }
                                offsetData.add(
                                        new OffsetData(tokenText, startOffset, endOffset, res, lineNum, foundLine));
                            }
                        }
                    }
                }

                String filePath = doc.getField(PATH_FIELD).stringValue();
                LOG.debug("Doc {} path {} score {} ", docId, filePath, scoreDoc.score);
                results.add(new SearchResultEntry(filePath, offsetData));
            }

            final long elapsedTimeMillis = System.currentTimeMillis() - startTime;

            boolean hasMoreToRetrieve = numSkipDocs + topDocs.scoreDocs.length < totalHitsNum;
            QueryExpression nextPageQueryExpression = null;
            if (hasMoreToRetrieve) {
                nextPageQueryExpression = createNextPageQuery(query, numSkipDocs + topDocs.scoreDocs.length);
            }

            return SearchResult.aSearchResult().withResults(results).withTotalHits(totalHitsNum)
                    .withNextPageQueryExpression(nextPageQueryExpression).withElapsedTimeMillis(elapsedTimeMillis)
                    .build();
        } catch (ParseException e) {
            throw new InvalidQueryException(e.getMessage(), e);
        } catch (IOException e) {
            throw new QueryExecutionException(e.getMessage(), e);
        } finally {
            try {
                if (luceneSearcher != null) {
                    searcherManager.release(luceneSearcher);
                }
            } catch (IOException e) {
                LOG.error(e.getMessage());
            }
        }
    }

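    /**
     * Builds a conjunction of the optional path prefix, name, and text clauses; the name and text
     * clauses are parsed with the classic {@link QueryParser} and allow leading wildcards.
     */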
    private Query createLuceneQuery(QueryExpression query) throws ParseException, IOException {
        BooleanQuery.Builder luceneQueryBuilder = new BooleanQuery.Builder();
        final String name = query.getName();
        final String path = query.getPath();
        final String text = query.getText();
        if (path != null) {
            luceneQueryBuilder.add(new PrefixQuery(new Term(PATH_FIELD, path)), BooleanClause.Occur.MUST);
        }
        if (name != null) {
            QueryParser qParser = new QueryParser(NAME_FIELD, analyzer);
            qParser.setAllowLeadingWildcard(true);
            luceneQueryBuilder.add(qParser.parse(name), BooleanClause.Occur.MUST);
        }
        if (text != null) {
            QueryParser qParser = new QueryParser(TEXT_FIELD, analyzer);
            qParser.setAllowLeadingWildcard(true);
            luceneQueryBuilder.add(qParser.parse(text), BooleanClause.Occur.MUST);
        }
        return luceneQueryBuilder.build();
    }

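    /**
     * Skips {@code numSkipDocs} documents by paging through the hits with {@code searchAfter} in
     * frames of at most {@link #RESULT_LIMIT} documents, and returns the last skipped document.
     */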
    private ScoreDoc skipScoreDocs(IndexSearcher luceneSearcher, Query luceneQuery, int numSkipDocs)
            throws IOException {
        final int readFrameSize = Math.min(numSkipDocs, RESULT_LIMIT);
        ScoreDoc scoreDoc = null;
        int retrievedDocs = 0;
        TopDocs topDocs;
        do {
            topDocs = luceneSearcher.searchAfter(scoreDoc, luceneQuery, readFrameSize, sort, true, true);
            if (topDocs.scoreDocs.length > 0) {
                scoreDoc = topDocs.scoreDocs[topDocs.scoreDocs.length - 1];
            }
            retrievedDocs += topDocs.scoreDocs.length;
        } while (retrievedDocs < numSkipDocs && topDocs.scoreDocs.length > 0);

        if (retrievedDocs > numSkipDocs) {
            int lastScoreDocIndex = topDocs.scoreDocs.length - (retrievedDocs - numSkipDocs);
            scoreDoc = topDocs.scoreDocs[lastScoreDocIndex];
        }

        return scoreDoc;
    }

    private QueryExpression createNextPageQuery(QueryExpression originalQuery, int newSkipCount) {
        return new QueryExpression().setText(originalQuery.getText()).setName(originalQuery.getName())
                .setPath(originalQuery.getPath()).setSkipCount(newSkipCount)
                .setMaxItems(originalQuery.getMaxItems());
    }

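    /** Indexes a single file, or recursively indexes every file under a directory. */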
    @Override
    public final void add(Path fsPath) {

        try {
            if (fsPath.toFile().isDirectory()) {
                try {
                    Files.walkFileTree(fsPath, new SimpleFileVisitor<Path>() {
                        @Override
                        public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) {
                            addFile(file);
                            return FileVisitResult.CONTINUE;
                        }
                    });
                } catch (IOException e) {
                    LOG.warn("Unable to index {} because {}", fsPath.toString(), e.getMessage());
                }
            } else {
                addFile(fsPath);
            }
            printStatistic();
        } catch (IOException e) {
            LOG.warn("Can't commit changes to index for: {} because {} ", fsPath.toAbsolutePath().toString(),
                    e.getMessage());
        }
    }

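    /** Reads the file as UTF-8 and adds or replaces the Lucene document keyed by its workspace path. */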
    private void addFile(Path fsPath) {
        if (!fsPath.toFile().exists()) {
            return;
        }

        if (isExcluded(fsPath)) {
            return;
        }
        String wsPath = pathTransformer.transform(fsPath);
        LOG.debug("Adding file {} ", wsPath);

        try (Reader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(fsPath.toFile()), StandardCharsets.UTF_8))) {
            String name = nameOf(wsPath);
            Document doc = new Document();
            doc.add(new StringField(PATH_FIELD, wsPath, Field.Store.YES));
            doc.add(new SortedDocValuesField(PATH_FIELD, new BytesRef(wsPath)));
            doc.add(new TextField(NAME_FIELD, name, Field.Store.YES));
            try {
                doc.add(new TextField(TEXT_FIELD, CharStreams.toString(reader), Field.Store.YES));
            } catch (MalformedInputException e) {
                LOG.warn("Can't index file: {}", wsPath);
            }
            luceneIndexWriter.updateDocument(new Term(PATH_FIELD, wsPath), doc);

        } catch (IOException e) {
            LOG.warn("Can't index file: {}", wsPath);
        }
    }

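    /** Removes the documents for a file, or for a directory and everything underneath it. */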
    @Override
    public final void delete(Path fsPath) {

        String wsPath = pathTransformer.transform(fsPath);
        try {

            // In most cases this runs after the deletion has already happened, so there is no way
            // to tell whether the path was a file or a directory; try to delete both.
            BooleanQuery.Builder deleteFileOrFolder = new BooleanQuery.Builder();
            deleteFileOrFolder.setMinimumNumberShouldMatch(1);
            deleteFileOrFolder.add(new TermQuery(new Term(PATH_FIELD, wsPath)), Occur.SHOULD);
            deleteFileOrFolder.add(new PrefixQuery(new Term(PATH_FIELD, wsPath + "/")), Occur.SHOULD);
            luceneIndexWriter.deleteDocuments(deleteFileOrFolder.build());
            printStatistic();
        } catch (IOException e) {
            LOG.warn("Can't delete index for file: {}", wsPath);
        }
    }

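    /** Logs index reader and writer statistics when debug logging is enabled. */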
    private void printStatistic() throws IOException {
        if (LOG.isDebugEnabled()) {
            IndexSearcher luceneSearcher = null;
            try {
                searcherManager.maybeRefresh();
                luceneSearcher = searcherManager.acquire();
                IndexReader reader = luceneSearcher.getIndexReader();
                LOG.debug(
                        "IndexReader numDocs={} numDeletedDocs={} maxDoc={} hasDeletions={}. Writer numDocs={} numRamDocs={} hasPendingMerges={}  hasUncommittedChanges={} hasDeletions={}",
                        reader.numDocs(), reader.numDeletedDocs(), reader.maxDoc(), reader.hasDeletions(),
                        luceneIndexWriter.numDocs(), luceneIndexWriter.numRamDocs(),
                        luceneIndexWriter.hasPendingMerges(), luceneIndexWriter.hasUncommittedChanges(),
                        luceneIndexWriter.hasDeletions());
            } finally {
                if (luceneSearcher != null) {
                    searcherManager.release(luceneSearcher);
                }
            }
        }
    }

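    /** Re-indexes a single file by replacing its existing document. */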
    @Override
    public final void update(Path fsPath) {
        addFile(fsPath);
    }

    private boolean isExcluded(Path fsPath) {
        for (PathMatcher matcher : excludePatterns) {
            if (matcher.matches(fsPath)) {
                return true;
            }
        }
        return false;
    }
}
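
Usage example

The sketch below is a minimal, self-contained illustration of the same indexing and search pattern the class above relies on: the whitespace + lowercase analyzer, an updateDocument call keyed by the path field, and a near-real-time SearcherManager that sees changes without an explicit commit. It is not part of Eclipse Che; the class name, the temporary index directory, and the sample document are made up for the example.

import java.nio.file.Files;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilterFactory;
import org.apache.lucene.analysis.core.WhitespaceTokenizerFactory;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.SearcherFactory;
import org.apache.lucene.search.SearcherManager;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;

public class LuceneSearcherSketch {

    public static void main(String[] args) throws Exception {
        // Same analyzer configuration as LuceneSearcher: whitespace tokenizer + lowercase filter.
        Analyzer analyzer = CustomAnalyzer.builder()
                .withTokenizer(WhitespaceTokenizerFactory.class)
                .addTokenFilter(LowerCaseFilterFactory.class)
                .build();

        try (IndexWriter writer = new IndexWriter(
                FSDirectory.open(Files.createTempDirectory("lucene-sketch")),
                new IndexWriterConfig(analyzer))) {

            // Add-or-replace a document keyed by its path, mirroring addFile().
            Document doc = new Document();
            doc.add(new StringField("path", "/project/readme.txt", Field.Store.YES));
            doc.add(new TextField("text", "Hello Lucene searcher", Field.Store.YES));
            writer.updateDocument(new Term("path", "/project/readme.txt"), doc);

            // Near-real-time manager opened on the writer: maybeRefresh() makes the
            // uncommitted update visible to searches, just as in search() above.
            SearcherManager manager = new SearcherManager(writer, true, true, new SearcherFactory());
            manager.maybeRefresh();
            IndexSearcher searcher = manager.acquire();
            try {
                TopDocs hits = searcher.search(new QueryParser("text", analyzer).parse("lucene"), 10);
                for (ScoreDoc hit : hits.scoreDocs) {
                    System.out.println(searcher.doc(hit.doc).get("path"));
                }
            } finally {
                manager.release(searcher);
            }
        }
    }
}

Because the SearcherManager is opened directly on the IndexWriter, refreshed searchers see updates that have not yet been committed. That is why LuceneSearcher can afford to commit only every 30 seconds without delaying search results.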