org.apache.nifi.provenance.lucene.LuceneUtil.java Source code

Introduction

Here is the source code for org.apache.nifi.provenance.lucene.LuceneUtil.java
Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nifi.provenance.lucene;

import java.io.File;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.nifi.processor.DataUnit;
import org.apache.nifi.provenance.SearchableFields;
import org.apache.nifi.provenance.search.SearchTerm;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.NumericRangeQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.util.Version;

public class LuceneUtil {

    public static final Version LUCENE_VERSION = Version.LATEST;

    /**
     * Returns the part of {@code value} that precedes the first occurrence of {@code searchValue}.
     *
     * @param value the string to search in
     * @param searchValue the delimiter to look for
     * @return the text before the first occurrence of the delimiter, or {@code value} itself
     *         when the delimiter does not occur
     */
    public static String substringBefore(final String value, final String searchValue) {
        final int delimiterPos = value.indexOf(searchValue);
        if (delimiterPos < 0) {
            return value;
        }
        return value.substring(0, delimiterPos);
    }

    /**
     * Returns the part of {@code value} that follows the first occurrence of {@code searchValue}.
     * <p>
     * The whole delimiter is skipped, so multi-character delimiters are handled correctly
     * (e.g. {@code substringAfter("key::value", "::")} yields {@code "value"}).
     *
     * @param value the string to search in
     * @param searchValue the delimiter to look for
     * @return the text after the first occurrence of the delimiter (possibly empty), or
     *         {@code value} itself when the delimiter does not occur
     */
    public static String substringAfter(final String value, final String searchValue) {
        final int index = value.indexOf(searchValue);
        if (index < 0) {
            return value;
        }

        // Skip the full delimiter, not just its first character. When the delimiter ends the
        // string, the offset equals value.length() and substring() yields "".
        return value.substring(index + searchValue.length());
    }

    /**
     * Returns the part of {@code value} that precedes the last occurrence of {@code searchValue}.
     *
     * @param value the string to search in
     * @param searchValue the delimiter to look for
     * @return the text before the last occurrence of the delimiter, or {@code value} itself
     *         when the delimiter does not occur
     */
    public static String substringBeforeLast(final String value, final String searchValue) {
        final int lastDelimiterPos = value.lastIndexOf(searchValue);
        if (lastDelimiterPos < 0) {
            return value;
        }
        return value.substring(0, lastDelimiterPos);
    }

    /**
     * Returns the part of {@code value} that follows the last occurrence of {@code searchValue}.
     * <p>
     * The whole delimiter is skipped, so multi-character delimiters are handled correctly
     * (e.g. {@code substringAfterLast("a::b::c", "::")} yields {@code "c"}).
     *
     * @param value the string to search in
     * @param searchValue the delimiter to look for
     * @return the text after the last occurrence of the delimiter (possibly empty), or
     *         {@code value} itself when the delimiter does not occur or is empty
     */
    public static String substringAfterLast(final String value, final String searchValue) {
        final int index = value.lastIndexOf(searchValue);

        // lastIndexOf("") returns value.length(); that is the only way the second condition
        // can hold, and in that case the input is returned unchanged.
        if (index < 0 || index >= value.length()) {
            return value;
        }

        // Skip the full delimiter, not just its first character.
        return value.substring(index + searchValue.length());
    }

    /**
     * Resolves the single provenance log file that matches the given base name.
     *
     * @param baseName the file name without its extension(s)
     * @param allProvenanceLogs the candidate log file paths
     * @return the matching file when exactly one match is found by
     *         {@link #getProvenanceLogFiles(String, Collection)}; {@code null} when there are
     *         zero or several matches
     */
    public static File getProvenanceLogFile(final String baseName, final Collection<Path> allProvenanceLogs) {
        final List<File> candidates = getProvenanceLogFiles(baseName, allProvenanceLogs);
        return (candidates.size() == 1) ? candidates.get(0) : null;
    }

    /**
     * Finds the existing provenance log files whose name starts with {@code baseName + "."}.
     * <p>
     * For each matching path, the file itself is returned if it exists on disk; otherwise a
     * sibling with the same name plus a {@code .gz} suffix is returned if that one exists.
     * Paths for which neither exists are skipped.
     *
     * @param baseName the file name without its extension(s)
     * @param allProvenanceLogs the candidate log file paths
     * @return the matching files that exist on disk, in iteration order of
     *         {@code allProvenanceLogs}; never {@code null}
     */
    public static List<File> getProvenanceLogFiles(final String baseName,
            final Collection<Path> allProvenanceLogs) {
        final String prefix = baseName + ".";
        final List<File> found = new ArrayList<>();

        for (final Path candidate : allProvenanceLogs) {
            final File plainFile = candidate.toFile();
            if (!plainFile.getName().startsWith(prefix)) {
                continue;
            }

            if (plainFile.exists()) {
                found.add(plainFile);
                continue;
            }

            // The plain file is gone; fall back to a ".gz" sibling in the same directory.
            final File compressedFile = new File(plainFile.getParentFile(), plainFile.getName() + ".gz");
            if (compressedFile.exists()) {
                found.add(compressedFile);
            }
        }

        return found;
    }

    /**
     * Converts a provenance {@link org.apache.nifi.provenance.search.Query} into the equivalent
     * Lucene query.
     * <p>
     * Every search term becomes a mandatory clause: a {@link WildcardQuery} when the value
     * contains {@code *} or {@code ?}, otherwise a {@link TermQuery}. Optional file-size and
     * event-time bounds are added as mandatory, inclusive numeric range clauses. A query that
     * has no search terms, no date bounds and no file-size bounds is converted to a
     * {@link MatchAllDocsQuery}.
     *
     * @param query the provenance query to convert
     * @return the Lucene query to execute
     * @throws IllegalArgumentException if any search term has a {@code null} value
     */
    public static org.apache.lucene.search.Query convertQuery(final org.apache.nifi.provenance.search.Query query) {
        final boolean hasDateBounds = query.getStartDate() != null || query.getEndDate() != null;
        final boolean hasSizeBounds = query.getMinFileSize() != null || query.getMaxFileSize() != null;

        // Only shortcut to "match everything" when there is truly nothing to filter on. The
        // file-size bounds must be part of this check, otherwise a size-only query would
        // silently return every document.
        if (!hasDateBounds && !hasSizeBounds && query.getSearchTerms().isEmpty()) {
            return new MatchAllDocsQuery();
        }

        final BooleanQuery luceneQuery = new BooleanQuery();
        for (final SearchTerm searchTerm : query.getSearchTerms()) {
            final String searchValue = searchTerm.getValue();
            if (searchValue == null) {
                throw new IllegalArgumentException("Empty search value not allowed (for term '"
                        + searchTerm.getSearchableField().getFriendlyName() + "')");
            }

            // NOTE(review): lower-casing uses the default locale; presumably this has to match
            // how values are lower-cased at index time - confirm before changing.
            final Term term = new Term(searchTerm.getSearchableField().getSearchableFieldName(),
                    searchValue.toLowerCase());

            final boolean isWildcard = searchValue.contains("*") || searchValue.contains("?");
            final org.apache.lucene.search.Query termQuery;
            if (isWildcard) {
                termQuery = new WildcardQuery(term);
            } else {
                termQuery = new TermQuery(term);
            }
            luceneQuery.add(new BooleanClause(termQuery, Occur.MUST));
        }

        // A null bound leaves that side of the range open.
        final Long minBytes = query.getMinFileSize() == null ? null
                : DataUnit.parseDataSize(query.getMinFileSize(), DataUnit.B).longValue();
        final Long maxBytes = query.getMaxFileSize() == null ? null
                : DataUnit.parseDataSize(query.getMaxFileSize(), DataUnit.B).longValue();
        if (minBytes != null || maxBytes != null) {
            luceneQuery.add(NumericRangeQuery.newLongRange(SearchableFields.FileSize.getSearchableFieldName(),
                    minBytes, maxBytes, true, true), Occur.MUST);
        }

        final Long minDateTime = query.getStartDate() == null ? null : query.getStartDate().getTime();
        final Long maxDateTime = query.getEndDate() == null ? null : query.getEndDate().getTime();
        if (maxDateTime != null || minDateTime != null) {
            luceneQuery.add(NumericRangeQuery.newLongRange(SearchableFields.EventTime.getSearchableFieldName(),
                    minDateTime, maxDateTime, true, true), Occur.MUST);
        }

        return luceneQuery;
    }

    /**
     * Sorts the given documents in place so that the records can be retrieved efficiently.
     * <p>
     * Documents are ordered by storage filename first. Within the same file, when both
     * documents carry a {@link FieldNames#BLOCK_INDEX} field they are ordered by block index
     * and then by event identifier; otherwise they are ordered by
     * {@link FieldNames#STORAGE_FILE_OFFSET}.
     *
     * @param documents
     *            list of {@link Document}s to sort; modified in place
     */
    public static void sortDocsForRetrieval(final List<Document> documents) {
        Collections.sort(documents, new Comparator<Document>() {
            @Override
            public int compare(final Document o1, final Document o2) {
                final String filename1 = o1.get(FieldNames.STORAGE_FILENAME);
                final String filename2 = o2.get(FieldNames.STORAGE_FILENAME);

                final int filenameComp = filename1.compareTo(filename2);
                if (filenameComp != 0) {
                    return filenameComp;
                }

                // Each block index must be read from its own document; reading both from o1
                // would make the block comparison always 0 and break the ordering.
                final IndexableField fileOffset1 = o1.getField(FieldNames.BLOCK_INDEX);
                final IndexableField fileOffset2 = o2.getField(FieldNames.BLOCK_INDEX);
                if (fileOffset1 != null && fileOffset2 != null) {
                    final int blockIndexResult = Long.compare(fileOffset1.numericValue().longValue(),
                            fileOffset2.numericValue().longValue());
                    if (blockIndexResult != 0) {
                        return blockIndexResult;
                    }

                    // Same file and same block: fall back to the event identifier.
                    final long eventId1 = o1.getField(SearchableFields.Identifier.getSearchableFieldName())
                            .numericValue().longValue();
                    final long eventId2 = o2.getField(SearchableFields.Identifier.getSearchableFieldName())
                            .numericValue().longValue();
                    return Long.compare(eventId1, eventId2);
                }

                // At least one document has no block index: order by the stored file offset.
                final long offset1 = o1.getField(FieldNames.STORAGE_FILE_OFFSET).numericValue().longValue();
                final long offset2 = o2.getField(FieldNames.STORAGE_FILE_OFFSET).numericValue().longValue();
                return Long.compare(offset1, offset2);
            }
        });
    }

    /**
     * Groups documents by their {@link FieldNames#STORAGE_FILENAME} value.
     * <p>
     * After grouping, each group is sorted via {@link #sortDocsForRetrieval(List)} for more
     * efficient record retrieval. The input list itself is not reordered.
     *
     * @param documents
     *            list of {@link Document}s to group
     * @return a {@link Map} with the {@link FieldNames#STORAGE_FILENAME} value as key and the
     *         sorted {@link List} of {@link Document}s stored in that file as value
     */
    public static Map<String, List<Document>> groupDocsByStorageFileName(final List<Document> documents) {
        final Map<String, List<Document>> docsByFile = new HashMap<>();

        for (final Document doc : documents) {
            final String storageFile = doc.get(FieldNames.STORAGE_FILENAME);
            List<Document> bucket = docsByFile.get(storageFile);
            if (bucket == null) {
                // First document seen for this file: start a new group.
                bucket = new ArrayList<Document>();
                docsByFile.put(storageFile, bucket);
            }
            bucket.add(doc);
        }

        for (final List<Document> group : docsByFile.values()) {
            sortDocsForRetrieval(group);
        }

        return docsByFile;
    }

    /**
     * Truncate a single field so that it does not exceed Lucene's byte size limit on indexed terms.
     * <p>
     * The length is measured in UTF-8 bytes, because that is how Lucene measures
     * {@link IndexWriter#MAX_TERM_LENGTH}, independent of the platform default charset. A
     * multi-byte character that would be cut in half by the truncation is dropped entirely.
     *
     * @param field the string to be indexed; may be {@code null}
     * @return a string that can be indexed which is within Lucene's byte size limit, {@code null} if
     *         {@code field} is {@code null}, or {@code null} if anything goes wrong
     */
    public static String truncateIndexField(String field) {
        if (field == null) {
            return field;
        }

        // Always use UTF-8: with a different default charset (e.g. ISO-8859-1) the byte count
        // could stay under the limit here while the UTF-8 encoded term still exceeds it.
        final Charset charset = Charset.forName("UTF-8");
        final byte[] bytes = field.getBytes(charset);
        if (bytes.length <= IndexWriter.MAX_TERM_LENGTH) {
            return field;
        }

        // chop the field to maximum allowed byte length
        final ByteBuffer bbuf = ByteBuffer.wrap(bytes, 0, IndexWriter.MAX_TERM_LENGTH);

        try {
            // decode the chopped bytes back into a string; IGNORE drops a trailing partial
            // multi-byte sequence instead of failing on it
            final CharsetDecoder decoder = charset.newDecoder();
            decoder.onMalformedInput(CodingErrorAction.IGNORE);
            final CharBuffer cbuf = decoder.decode(bbuf);
            return cbuf.toString();
        } catch (final CharacterCodingException ignored) {
            // Not expected, since malformed input is ignored rather than reported. Fall
            // through to the documented "something went wrong" result.
        }

        // if we get here, something bad has happened
        return null;
    }
}