org.opengrok.indexer.analysis.plain.PlainAnalyzer.java Source code

Introduction

Here is the source code for org.opengrok.indexer.analysis.plain.PlainAnalyzer.java
Source

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * See LICENSE.txt included in this distribution for the specific
 * language governing permissions and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at LICENSE.txt.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2018, Oracle and/or its affiliates. All rights reserved.
 * Portions Copyright (c) 2017-2018, Chris Fraire <cfraire@me.com>.
 */
package org.opengrok.indexer.analysis.plain;

import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.Writer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.StoredField;
import org.opengrok.indexer.analysis.AnalyzerFactory;
import org.opengrok.indexer.analysis.Definitions;
import org.opengrok.indexer.analysis.ExpandTabsReader;
import org.opengrok.indexer.analysis.JFlexTokenizer;
import org.opengrok.indexer.analysis.JFlexXref;
import org.opengrok.indexer.analysis.OGKTextField;
import org.opengrok.indexer.analysis.OGKTextVecField;
import org.opengrok.indexer.analysis.Scopes;
import org.opengrok.indexer.analysis.StreamSource;
import org.opengrok.indexer.analysis.TextAnalyzer;
import org.opengrok.indexer.analysis.WriteXrefArgs;
import org.opengrok.indexer.analysis.Xrefer;
import org.opengrok.indexer.search.QueryBuilder;
import org.opengrok.indexer.util.NullWriter;

/**
 * Analyzer for plain text files. Created on September 21, 2005.
 *
 * @author Chandan
 */
public class PlainAnalyzer extends TextAnalyzer {

    /**
     * Constructs a plain-text analyzer tied to the given factory.
     * @param factory defined instance for the analyzer
     */
    protected PlainAnalyzer(AnalyzerFactory factory) {
        super(factory);
    }

    /**
     * Constructs a plain-text analyzer tied to the given factory and using
     * the supplied tokenizer for symbols.
     * @param factory defined instance for the analyzer
     * @param symbolTokenizer tokenizer handed on to the superclass constructor
     */
    protected PlainAnalyzer(AnalyzerFactory factory, JFlexTokenizer symbolTokenizer) {
        super(factory, symbolTokenizer);
    }

    /**
     * Gets the version number used to tag processed documents, so that a
     * document can be re-analyzed later when its stored version number
     * differs from the one of the current implementation.
     * @return 20180208_00
     */
    @Override
    protected int getSpecializedVersionNo() {
        return 20180208_00; // Edit comment above too!
    }

    /**
     * Builds the xref producer for plain text: a {@link PlainXref} over the
     * given reader, wrapped in a {@link JFlexXref}.
     * @param reader the data to produce xref for
     * @return an xref instance
     */
    @Override
    protected Xrefer newXref(Reader reader) {
        PlainXref lexer = new PlainXref(reader);
        return new JFlexXref(lexer);
    }

    /**
     * Obtains the superclass reader for {@code stream} and passes it through
     * {@link ExpandTabsReader#wrap} together with the current project.
     * @param stream the input to read
     * @return the wrapped reader
     * @throws IOException if the superclass reader cannot be created
     */
    @Override
    protected Reader getReader(InputStream stream) throws IOException {
        return wrapReader(super.getReader(stream));
    }

    /**
     * Populates {@code doc} with the fields derived from {@code src}: the
     * full-text field, the ctags-based definitions and tags (when ctags is
     * available and reports symbols), the symbol references field and --
     * when xref output is requested or scopes are enabled -- the serialized
     * scopes, the number of lines and the LOC gathered while writing the xref.
     * @param doc the document to add fields to
     * @param src the source of the streams to analyze
     * @param xrefOut destination of the xref output; may be {@code null}
     * @throws IOException if reading or writing fails
     * @throws InterruptedException declared for the ctags and xref steps
     */
    @Override
    public void analyze(Document doc, StreamSource src, Writer xrefOut) throws IOException, InterruptedException {
        Reader fullTextReader = getReader(src.getStream());
        doc.add(new OGKTextField(QueryBuilder.FULL, fullTextReader));

        // Definitions stay null unless ctags can be run on a known path.
        Definitions defs = null;
        String path = doc.get(QueryBuilder.FULLPATH);
        if (path != null && ctags != null) {
            defs = ctags.doCtags(path);
            boolean hasSymbols = defs != null && defs.numberOfSymbols() > 0;
            if (hasSymbols) {
                tryAddingDefs(doc, defs, src, path);
                doc.add(new StoredField(QueryBuilder.TAGS, defs.serialize()));
            }
        }

        /*
         * The analyzer's own symbol tokenizer is used explicitly here as a
         * workaround for #1376: symbols search works like full text search.
         */
        OGKTextField refsField = new OGKTextField(QueryBuilder.REFS, this.symbolTokenizer);
        this.symbolTokenizer.setReader(getReader(src.getStream()));
        doc.add(refsField);

        /*
         * Scopes are a by-product of xref generation. With xrefs turned off
         * but scopes enabled, writeXref must still run, so its output is sent
         * to a dummy writer that discards it.
         */
        Writer sink = (xrefOut == null && scopesEnabled) ? new NullWriter() : xrefOut;
        if (sink == null) {
            return;
        }

        try (Reader in = getReader(src.getStream())) {
            WriteXrefArgs xrefArgs = new WriteXrefArgs(in, sink);
            xrefArgs.setDefs(defs);
            xrefArgs.setProject(project);
            Xrefer xrefer = writeXref(xrefArgs);

            Scopes scopes = xrefer.getScopes();
            if (scopes.size() > 0) {
                doc.add(new StoredField(QueryBuilder.SCOPES, scopes.serialize()));
            }

            addNumLines(doc, xrefer.getLineNumber());
            addLOC(doc, xrefer.getLOC());
        }
    }

    /**
     * Adds the {@link QueryBuilder#DEFS} field to {@code doc}, backed by a
     * {@link DefinitionsTokenStream} initialized from {@code defs} and
     * {@code src}.
     * @param doc the document to add the field to
     * @param defs the ctags definitions to index
     * @param src the source handed to the token stream
     * @param fullpath path of the analyzed file (not used by this method)
     * @throws IOException if the token stream cannot be initialized
     */
    private void tryAddingDefs(Document doc, Definitions defs, StreamSource src, String fullpath)
            throws IOException {

        DefinitionsTokenStream defsTokens = new DefinitionsTokenStream();
        defsTokens.initialize(defs, src, this::wrapReader);

        /*
         * Why term vectors are stored for DEFS:
         *
         * Testing showed that UnifiedHighlighter falls back to ANALYSIS
         * when multi-term queries (MTQs) such as prefixes and wildcards are
         * present, even for fields analyzed with POSTINGS -- i.e. with
         * DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS. This happens although
         * the UnifiedHighlighter comment for
         * shouldHandleMultiTermQuery(String) seems to indicate that
         * postings should be sufficient: "MTQ highlighting can be
         * expensive, particularly when using offsets in postings."
         *
         * Re-analysis of DEFS will not be correct, however, because the
         * PlainAnalyzer that UnifiedHighlighter would use on-the-fly does
         * not correctly integrate ctags Definitions.
         *
         * Storing term vectors lets UnifiedHighlighter avoid re-analysis at
         * the cost of a larger index. As DEFS are a small subset of the
         * source text, that cost seems worth paying for accurate
         * highlighting of DEFS MTQs.
         */
        doc.add(new OGKTextVecField(QueryBuilder.DEFS, defsTokens));
    }

    /**
     * Same wrapping as {@link #getReader(java.io.InputStream)}, but applied
     * on top of an already existing reader.
     * @param reader the reader to wrap
     * @return the result of {@link ExpandTabsReader#wrap} for the current project
     * @see #getReader(java.io.InputStream)
     */
    private Reader wrapReader(Reader reader) {
        return ExpandTabsReader.wrap(reader, project);
    }
}