org.sleuthkit.autopsy.keywordsearch.HighlightedTextMarkup.java Source code

Introduction

Here is the source code for org.sleuthkit.autopsy.keywordsearch.HighlightedTextMarkup.java
Source

/*
 * Autopsy Forensic Browser
 *
 * Copyright 2011-2015 Basis Technology Corp.
 * Contact: carrier <at> sleuthkit <dot> org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sleuthkit.autopsy.keywordsearch;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeSet;
import java.util.logging.Level;

import org.openide.util.NbBundle;
import org.sleuthkit.autopsy.coreutils.Logger;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrRequest.METHOD;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.sleuthkit.autopsy.coreutils.Version;
import org.sleuthkit.autopsy.datamodel.TextMarkupLookup;
import org.sleuthkit.autopsy.keywordsearch.KeywordQueryFilter.FilterType;
import org.sleuthkit.datamodel.Content;

/**
 * Highlights hits for a given document. Knows about pages and such for the content viewer. 
 */
class HighlightedTextMarkup implements TextMarkup, TextMarkupLookup {

    private static final Logger logger = Logger.getLogger(HighlightedTextMarkup.class.getName());
    private static final String HIGHLIGHT_PRE = "<span style='background:yellow'>"; //NON-NLS
    private static final String HIGHLIGHT_POST = "</span>"; //NON-NLS
    private static final String ANCHOR_PREFIX = HighlightedTextMarkup.class.getName() + "_";

    private long objectId;
    private String keywordHitQuery;
    private Server solrServer;
    private int numberPages;
    private int currentPage;
    private boolean isRegex = false;
    private boolean group = true;
    private boolean hasChunks = false;
    //stores all pages/chunks that have hits as key, and number of hits as a value, or 0 if yet unknown
    private LinkedHashMap<Integer, Integer> hitsPages;
    //stored page num -> current hit number mapping
    private HashMap<Integer, Integer> pagesToHits;
    private List<Integer> pages;
    private QueryResults hits = null; //original hits that may get passed in
    private String originalQuery = null; //or original query if hits are not available
    private boolean isPageInfoLoaded = false;
    private static final boolean DEBUG = (Version.getBuildType() == Version.Type.DEVELOPMENT);

    HighlightedTextMarkup(long objectId, String keywordHitQuery, boolean isRegex) {
        this.objectId = objectId;
        this.keywordHitQuery = keywordHitQuery;
        this.isRegex = isRegex;
        this.group = true;
        this.hitsPages = new LinkedHashMap<>();
        this.pages = new ArrayList<>();
        this.pagesToHits = new HashMap<>();

        this.solrServer = KeywordSearch.getServer();
        this.numberPages = 0;
        this.currentPage = 0;
        //hits are unknown

    }

    //when the results are not known and need to requery to get hits
    HighlightedTextMarkup(long objectId, String solrQuery, boolean isRegex, String originalQuery) {
        this(objectId, solrQuery, isRegex);
        this.originalQuery = originalQuery;
    }

    HighlightedTextMarkup(long objectId, String solrQuery, boolean isRegex, QueryResults hits) {
        this(objectId, solrQuery, isRegex);
        this.hits = hits;
    }

    HighlightedTextMarkup(long objectId, String solrQuery, boolean isRegex, boolean group, QueryResults hits) {
        this(objectId, solrQuery, isRegex, hits);
        this.group = group;
    }

    /** 
     * The main goal of this method is to figure out which pages / chunks have hits.
     */
    private void loadPageInfo() {
        if (isPageInfoLoaded) {
            return;
        }
        try {
            this.numberPages = solrServer.queryNumFileChunks(this.objectId);
        } catch (KeywordSearchModuleException ex) {
            logger.log(Level.WARNING, "Could not get number pages for content: " + this.objectId); //NON-NLS
            return;
        } catch (NoOpenCoreException ex) {
            logger.log(Level.WARNING, "Could not get number pages for content: " + this.objectId); //NON-NLS
            return;
        }

        if (this.numberPages == 0) {
            hasChunks = false;
        } else {
            hasChunks = true;
        }

        //if has chunks, get pages with hits
        if (hasChunks) {
            //extract pages of interest, sorted

            /* If this is being called from the artifacts / dir tree, then we
             * need to perform the search to get the highlights. 
             */
            if (hits == null) {
                String queryStr = KeywordSearchUtil.escapeLuceneQuery(this.keywordHitQuery);
                if (isRegex) {
                    //use white-space sep. field to get exact matches only of regex query result
                    queryStr = Server.Schema.CONTENT_WS + ":" + "\"" + queryStr + "\"";
                }

                Keyword keywordQuery = new Keyword(queryStr, !isRegex);
                List<Keyword> keywords = new ArrayList<>();
                keywords.add(keywordQuery);
                KeywordSearchQuery chunksQuery = new LuceneQuery(new KeywordList(keywords), keywordQuery);

                chunksQuery.addFilter(new KeywordQueryFilter(FilterType.CHUNK, this.objectId));
                try {
                    hits = chunksQuery.performQuery();
                } catch (NoOpenCoreException ex) {
                    logger.log(Level.INFO, "Could not get chunk info and get highlights", ex); //NON-NLS
                    return;
                }
            }

            //organize the hits by page, filter as needed
            TreeSet<Integer> pagesSorted = new TreeSet<>();
            for (Keyword k : hits.getKeywords()) {
                for (KeywordHit hit : hits.getResults(k)) {
                    int chunkID = hit.getChunkId();
                    if (chunkID != 0 && this.objectId == hit.getSolrObjectId()) {
                        pagesSorted.add(chunkID);
                    }
                }
            }

            //set page to first page having highlights
            if (pagesSorted.isEmpty()) {
                this.currentPage = 0;
            } else {
                this.currentPage = pagesSorted.first();
            }

            for (Integer page : pagesSorted) {
                hitsPages.put(page, 0); //unknown number of matches in the page
                pages.add(page);
                pagesToHits.put(page, 0); //set current hit to 0th
            }

        } else {
            //no chunks
            this.numberPages = 1;
            this.currentPage = 1;
            hitsPages.put(1, 0);
            pages.add(1);
            pagesToHits.put(1, 0);
        }
        isPageInfoLoaded = true;
    }

    //constructor for dummy singleton factory instance for Lookup
    private HighlightedTextMarkup() {
    }

    long getObjectId() {
        return this.objectId;
    }

    @Override
    public int getNumberPages() {
        return this.numberPages;
        //return number of pages that have hits
        //return this.hitsPages.keySet().size();
    }

    @Override
    public int getCurrentPage() {
        return this.currentPage;
    }

    @Override
    public boolean hasNextPage() {
        final int numPages = pages.size();
        int idx = pages.indexOf(this.currentPage);
        return idx < numPages - 1;

    }

    @Override
    public boolean hasPreviousPage() {
        int idx = pages.indexOf(this.currentPage);
        return idx > 0;

    }

    @Override
    public int nextPage() {
        if (!hasNextPage()) {
            throw new IllegalStateException(
                    NbBundle.getMessage(this.getClass(), "HighlightedMatchesSource.nextPage.exception.msg"));
        }
        int idx = pages.indexOf(this.currentPage);
        currentPage = pages.get(idx + 1);
        return currentPage;
    }

    @Override
    public int previousPage() {
        if (!hasPreviousPage()) {
            throw new IllegalStateException(
                    NbBundle.getMessage(this.getClass(), "HighlightedMatchesSource.previousPage.exception.msg"));
        }
        int idx = pages.indexOf(this.currentPage);
        currentPage = pages.get(idx - 1);
        return currentPage;
    }

    @Override
    public boolean hasNextItem() {
        if (!this.pagesToHits.containsKey(currentPage)) {
            return false;
        }
        return this.pagesToHits.get(currentPage) < this.hitsPages.get(currentPage);
    }

    @Override
    public boolean hasPreviousItem() {
        if (!this.pagesToHits.containsKey(currentPage)) {
            return false;
        }
        return this.pagesToHits.get(currentPage) > 1;
    }

    @Override
    public int nextItem() {
        if (!hasNextItem()) {
            throw new IllegalStateException(
                    NbBundle.getMessage(this.getClass(), "HighlightedMatchesSource.nextItem.exception.msg"));
        }
        int cur = pagesToHits.get(currentPage) + 1;
        pagesToHits.put(currentPage, cur);
        return cur;
    }

    @Override
    public int previousItem() {
        if (!hasPreviousItem()) {
            throw new IllegalStateException(
                    NbBundle.getMessage(this.getClass(), "HighlightedMatchesSource.previousItem.exception.msg"));
        }
        int cur = pagesToHits.get(currentPage) - 1;
        pagesToHits.put(currentPage, cur);
        return cur;
    }

    @Override
    public int currentItem() {
        if (!this.pagesToHits.containsKey(currentPage)) {
            return 0;
        }
        return pagesToHits.get(currentPage);
    }

    @Override
    public LinkedHashMap<Integer, Integer> getHitsPages() {
        return this.hitsPages;
    }

    @Override
    public String getMarkup() {
        loadPageInfo(); //inits once

        String highLightField = null;

        String highlightQuery = keywordHitQuery;

        if (isRegex) {
            highLightField = LuceneQuery.HIGHLIGHT_FIELD_REGEX;
            //escape special lucene chars if not already escaped (if not a compound query)
            //TODO a better way to mark it a compound highlight query
            final String findSubstr = LuceneQuery.HIGHLIGHT_FIELD_REGEX + ":";
            if (!highlightQuery.contains(findSubstr)) {
                highlightQuery = KeywordSearchUtil.escapeLuceneQuery(highlightQuery);
            }
        } else {
            highLightField = LuceneQuery.HIGHLIGHT_FIELD_LITERAL;
            //escape special lucene chars always for literal queries query
            highlightQuery = KeywordSearchUtil.escapeLuceneQuery(highlightQuery);
        }

        SolrQuery q = new SolrQuery();
        q.setShowDebugInfo(DEBUG); //debug

        String queryStr = null;

        if (isRegex) {
            StringBuilder sb = new StringBuilder();
            sb.append(highLightField).append(":");
            if (group) {
                sb.append("\"");
            }
            sb.append(highlightQuery);
            if (group) {
                sb.append("\"");
            }
            queryStr = sb.toString();
        } else {
            //use default field, simplifies query
            //always force grouping/quotes
            queryStr = KeywordSearchUtil.quoteQuery(highlightQuery);
        }

        q.setQuery(queryStr);

        String contentIdStr = Long.toString(this.objectId);
        if (hasChunks) {
            contentIdStr += "_" + Integer.toString(this.currentPage);
        }

        final String filterQuery = Server.Schema.ID.toString() + ":"
                + KeywordSearchUtil.escapeLuceneQuery(contentIdStr);
        q.addFilterQuery(filterQuery);
        q.addHighlightField(highLightField); //for exact highlighting, try content_ws field (with stored="true" in Solr schema)

        //q.setHighlightSimplePre(HIGHLIGHT_PRE); //original highlighter only
        //q.setHighlightSimplePost(HIGHLIGHT_POST); //original highlighter only
        q.setHighlightFragsize(0); // don't fragment the highlight, works with original highlighter, or needs "single" list builder with FVH

        //tune the highlighter
        q.setParam("hl.useFastVectorHighlighter", "on"); //fast highlighter scales better than standard one NON-NLS
        q.setParam("hl.tag.pre", HIGHLIGHT_PRE); //makes sense for FastVectorHighlighter only NON-NLS
        q.setParam("hl.tag.post", HIGHLIGHT_POST); //makes sense for FastVectorHighlighter only NON-NLS
        q.setParam("hl.fragListBuilder", "single"); //makes sense for FastVectorHighlighter only NON-NLS

        //docs says makes sense for the original Highlighter only, but not really
        q.setParam("hl.maxAnalyzedChars", Server.HL_ANALYZE_CHARS_UNLIMITED); //NON-NLS

        try {
            QueryResponse response = solrServer.query(q, METHOD.POST);
            Map<String, Map<String, List<String>>> responseHighlight = response.getHighlighting();

            Map<String, List<String>> responseHighlightID = responseHighlight.get(contentIdStr);
            if (responseHighlightID == null) {
                return NbBundle.getMessage(this.getClass(), "HighlightedMatchesSource.getMarkup.noMatchMsg");

            }
            List<String> contentHighlights = responseHighlightID.get(highLightField);
            if (contentHighlights == null) {
                return NbBundle.getMessage(this.getClass(), "HighlightedMatchesSource.getMarkup.noMatchMsg");
            } else {
                // extracted content (minus highlight tags) is HTML-escaped
                String highlightedContent = contentHighlights.get(0).trim();
                highlightedContent = insertAnchors(highlightedContent);

                return "<html><pre>" + highlightedContent + "</pre></html>"; //NON-NLS
            }
        } catch (NoOpenCoreException ex) {
            logger.log(Level.WARNING, "Couldn't query markup for page: " + currentPage, ex); //NON-NLS
            return "";
        } catch (KeywordSearchModuleException ex) {
            logger.log(Level.WARNING, "Could not query markup for page: " + currentPage, ex); //NON-NLS
            return "";
        }
    }

    @Override
    public String toString() {
        return NbBundle.getMessage(this.getClass(), "HighlightedMatchesSource.toString");
    }

    @Override
    public boolean isSearchable() {
        return true;
    }

    @Override
    public String getAnchorPrefix() {
        return ANCHOR_PREFIX;
    }

    @Override
    public int getNumberHits() {
        if (!this.hitsPages.containsKey(this.currentPage)) {
            return 0;
        }
        return this.hitsPages.get(this.currentPage);
    }

    private String insertAnchors(String searchableContent) {
        int searchOffset = 0;
        int index = -1;

        StringBuilder buf = new StringBuilder(searchableContent);

        final String searchToken = HIGHLIGHT_PRE;
        final int indexSearchTokLen = searchToken.length();
        final String insertPre = "<a name='" + ANCHOR_PREFIX; //NON-NLS
        final String insertPost = "'></a>"; //NON-NLS
        int count = 0;
        while ((index = buf.indexOf(searchToken, searchOffset)) >= 0) {
            String insertString = insertPre + Integer.toString(count + 1) + insertPost;
            int insertStringLen = insertString.length();
            buf.insert(index, insertString);
            searchOffset = index + indexSearchTokLen + insertStringLen; //next offset past this anchor
            ++count;
        }

        //store total hits for this page, now that we know it
        this.hitsPages.put(this.currentPage, count);
        if (this.currentItem() == 0 && this.hasNextItem()) {
            this.nextItem();
        }

        return buf.toString();
    }

    //dummy instance for Lookup only
    private static TextMarkupLookup instance = null;

    //getter of the singleton dummy instance solely for Lookup purpose
    //this instance does not actually work with Solr
    public static synchronized TextMarkupLookup getDefault() {
        if (instance == null) {
            instance = new HighlightedTextMarkup();
        }
        return instance;
    }

    @Override
    // factory method to create an instance of this object
    public TextMarkupLookup createInstance(long objectId, String keywordHitQuery, boolean isRegex,
            String originalQuery) {
        return new HighlightedTextMarkup(objectId, keywordHitQuery, isRegex, originalQuery);
    }
}