org.apache.nifi.processors.standard.CalculateRecordStats.java Source code

Introduction

Here is the source code for org.apache.nifi.processors.standard.CalculateRecordStats.java.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.nifi.processors.standard;

import com.google.common.collect.Lists;
import org.apache.nifi.annotation.behavior.InputRequirement;
import org.apache.nifi.annotation.behavior.WritesAttribute;
import org.apache.nifi.annotation.behavior.WritesAttributes;
import org.apache.nifi.annotation.documentation.CapabilityDescription;
import org.apache.nifi.annotation.documentation.Tags;
import org.apache.nifi.annotation.lifecycle.OnScheduled;
import org.apache.nifi.components.PropertyDescriptor;
import org.apache.nifi.expression.ExpressionLanguageScope;
import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.processor.AbstractProcessor;
import org.apache.nifi.processor.ProcessContext;
import org.apache.nifi.processor.ProcessSession;
import org.apache.nifi.processor.Relationship;
import org.apache.nifi.processor.exception.ProcessException;
import org.apache.nifi.processor.util.StandardValidators;
import org.apache.nifi.record.path.FieldValue;
import org.apache.nifi.record.path.RecordPath;
import org.apache.nifi.record.path.RecordPathResult;
import org.apache.nifi.record.path.util.RecordPathCache;
import org.apache.nifi.serialization.RecordReader;
import org.apache.nifi.serialization.RecordReaderFactory;
import org.apache.nifi.serialization.record.Record;

import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;

@Tags({ "record", "stats", "metrics" })
@CapabilityDescription("A processor that can count the number of items in a record set, as well as provide counts based on "
        + "user-defined criteria on subsets of the record set.")
@InputRequirement(InputRequirement.Requirement.INPUT_REQUIRED)
@WritesAttributes({
        @WritesAttribute(attribute = CalculateRecordStats.RECORD_COUNT_ATTR, description = "A count of the records in the record set in the flowfile."),
        @WritesAttribute(attribute = "recordStats.<User Defined Property Name>.count", description = "A count of the records that contain a value for the user defined property."),
        @WritesAttribute(attribute = "recordStats.<User Defined Property Name>.<value>.count", description = "Each value discovered for the user defined property will have its own count attribute. "
                + "Total number of top N value counts to be added is defined by the limit configuration.") })
public class CalculateRecordStats extends AbstractProcessor {
    static final String RECORD_COUNT_ATTR = "record.count";

    static final PropertyDescriptor RECORD_READER = new PropertyDescriptor.Builder().name("record-stats-reader")
            .displayName("Record Reader").description("A record reader to use for reading the records.")
            .identifiesControllerService(RecordReaderFactory.class).required(true).build();

    static final PropertyDescriptor LIMIT = new PropertyDescriptor.Builder().name("record-stats-limit").description(
            "Limit the number of individual stats that are returned for each record path to the top N results.")
            .required(true).defaultValue("10").addValidator(StandardValidators.INTEGER_VALIDATOR)
            .expressionLanguageSupported(ExpressionLanguageScope.FLOWFILE_ATTRIBUTES).build();

    static final Relationship REL_SUCCESS = new Relationship.Builder().name("success")
            .description("If a flowfile is successfully processed, it goes here.").build();
    static final Relationship REL_FAILURE = new Relationship.Builder().name("failure")
            .description("If a flowfile fails to be processed, it goes here.").build();

    private RecordPathCache cache;

    static final Set<Relationship> RELATIONSHIPS;
    static final List<PropertyDescriptor> PROPERTIES;

    static {
        Set<Relationship> _rels = new HashSet<>();
        _rels.add(REL_SUCCESS);
        _rels.add(REL_FAILURE);
        RELATIONSHIPS = Collections.unmodifiableSet(_rels);
        List<PropertyDescriptor> _temp = new ArrayList<>();
        _temp.add(RECORD_READER);
        _temp.add(LIMIT);
        PROPERTIES = Collections.unmodifiableList(_temp);
    }

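    /**
     * Any dynamic property is accepted: its name becomes the attribute name suffix and its
     * value is a record path, with expression language evaluated against flowfile attributes.
     */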
    @Override
    protected PropertyDescriptor getSupportedDynamicPropertyDescriptor(final String propertyDescriptorName) {
        return new PropertyDescriptor.Builder().name(propertyDescriptorName).displayName(propertyDescriptorName)
                .dynamic(true).addValidator(StandardValidators.NON_BLANK_VALIDATOR)
                .expressionLanguageSupported(ExpressionLanguageScope.FLOWFILE_ATTRIBUTES).build();
    }

    @Override
    protected List<PropertyDescriptor> getSupportedPropertyDescriptors() {
        return PROPERTIES;
    }

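    // Build a small cache of compiled record paths when the processor is scheduled.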
    @OnScheduled
    public void onEnabled(ProcessContext context) {
        cache = new RecordPathCache(25);
    }

    @Override
    public Set<Relationship> getRelationships() {
        return RELATIONSHIPS;
    }

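    /**
     * Reads the incoming record set, computes the configured stats, attaches them to the
     * flowfile as attributes and routes to success; any error routes the flowfile to failure.
     */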
    @Override
    public void onTrigger(ProcessContext context, ProcessSession session) throws ProcessException {
        FlowFile input = session.get();
        if (input == null) {
            return;
        }

        try {
            Map<String, RecordPath> paths = getRecordPaths(context, input);
            Map<String, String> stats = getStats(input, paths, context, session);

            input = session.putAllAttributes(input, stats);

            session.transfer(input, REL_SUCCESS);

        } catch (Exception ex) {
            getLogger().error("Error processing stats.", ex);
            session.transfer(input, REL_FAILURE);
        }

    }

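    /**
     * Maps each dynamic property name to its compiled record path. Property values may use
     * expression language, which is evaluated against the flowfile's attributes.
     */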
    protected Map<String, RecordPath> getRecordPaths(ProcessContext context, FlowFile flowFile) {
        return context.getProperties().keySet().stream().filter(p -> p.isDynamic())
                .collect(Collectors.toMap(e -> e.getName(), e -> {
                    String val = context.getProperty(e).evaluateAttributeExpressions(flowFile).getValue();
                    return cache.getCompiled(val);
                }));
    }

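    /**
     * Reads the record set with the configured record reader and builds the stats map:
     * recordStats.<name> counts the records that have a value for that path,
     * recordStats.<name>.<value> counts each distinct value (trimmed to the top "limit"
     * entries), and record.count holds the total number of records.
     */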
    protected Map<String, String> getStats(FlowFile flowFile, Map<String, RecordPath> paths, ProcessContext context,
            ProcessSession session) {
        try (InputStream is = session.read(flowFile)) {
            RecordReaderFactory factory = context.getProperty(RECORD_READER)
                    .asControllerService(RecordReaderFactory.class);
            final Integer limit = context.getProperty(LIMIT).evaluateAttributeExpressions(flowFile).asInteger();
            RecordReader reader = factory.createRecordReader(flowFile, is, getLogger());

            Map<String, Integer> retVal = new HashMap<>();
            Record record;

            int recordCount = 0;
            List<String> baseKeys = new ArrayList<>();
            while ((record = reader.nextRecord()) != null) {
                for (Map.Entry<String, RecordPath> entry : paths.entrySet()) {
                    RecordPathResult result = entry.getValue().evaluate(record);
                    Optional<FieldValue> value = result.getSelectedFields().findFirst();
                    if (value.isPresent() && value.get().getValue() != null) {
                        String approxValue = value.get().getValue().toString();
                        String baseKey = String.format("recordStats.%s", entry.getKey());
                        String key = String.format("%s.%s", baseKey, approxValue);
                        Integer stat = retVal.getOrDefault(key, 0);
                        Integer baseStat = retVal.getOrDefault(baseKey, 0);
                        stat++;
                        baseStat++;

                        retVal.put(key, stat);
                        retVal.put(baseKey, baseStat);

                        if (!baseKeys.contains(baseKey)) {
                            baseKeys.add(baseKey);
                        }
                    }
                }

                recordCount++;
            }

            retVal = filterBySize(retVal, limit, baseKeys);

            retVal.put(RECORD_COUNT_ATTR, recordCount);

            return retVal.entrySet().stream()
                    .collect(Collectors.toMap(e -> e.getKey(), e -> e.getValue().toString()));
        } catch (Exception e) {
            getLogger().error("Could not read flowfile", e);
            throw new ProcessException(e);
        }
    }

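    /**
     * Keeps only the top "limit" per-value counts (largest first) while always retaining the
     * per-path totals identified by baseKeys.
     */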
    protected Map<String, Integer> filterBySize(Map<String, Integer> values, Integer limit, List<String> baseKeys) {
        Map<String, Integer> toFilter = values.entrySet().stream().filter(e -> !baseKeys.contains(e.getKey()))
                .collect(Collectors.toMap(e -> e.getKey(), e -> e.getValue()));
        Map<String, Integer> retVal = values.entrySet().stream().filter((e -> baseKeys.contains(e.getKey())))
                .collect(Collectors.toMap(e -> e.getKey(), e -> e.getValue()));

        List<Map.Entry<String, Integer>> _flat = new ArrayList<>(toFilter.entrySet());
        _flat.sort(Map.Entry.comparingByValue());
        _flat = Lists.reverse(_flat);
        for (int index = 0; index < _flat.size() && index < limit; index++) {
            retVal.put(_flat.get(index).getKey(), _flat.get(index).getValue());
        }

        return retVal;
    }
}
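
Usage example

The sketch below shows one way to exercise the processor with NiFi's mock test framework. It is a minimal illustration, not part of the original source: the test class name, the "sport" field, and the sample values are invented for the example, and it assumes the nifi-mock and nifi-mock-record-utils modules (for TestRunner and MockRecordParser) plus JUnit 5. The test sits in the same package because RECORD_READER and REL_SUCCESS are package-private.

/*
 * Minimal test sketch for CalculateRecordStats (not part of the original source).
 * Assumed dependencies: nifi-mock, nifi-mock-record-utils, JUnit 5.
 */
package org.apache.nifi.processors.standard;

import org.apache.nifi.serialization.record.MockRecordParser;
import org.apache.nifi.serialization.record.RecordFieldType;
import org.apache.nifi.util.MockFlowFile;
import org.apache.nifi.util.TestRunner;
import org.apache.nifi.util.TestRunners;
import org.junit.jupiter.api.Test;

public class CalculateRecordStatsSketchTest {

    @Test
    public void countsValuesSelectedByADynamicRecordPath() throws Exception {
        TestRunner runner = TestRunners.newTestRunner(CalculateRecordStats.class);

        // A mock record reader that yields three records with a single "sport" field.
        MockRecordParser reader = new MockRecordParser();
        reader.addSchemaField("sport", RecordFieldType.STRING);
        reader.addRecord("Soccer");
        reader.addRecord("Soccer");
        reader.addRecord("Tennis");

        runner.addControllerService("reader", reader);
        runner.enableControllerService(reader);
        runner.setProperty(CalculateRecordStats.RECORD_READER, "reader");

        // Dynamic property: the name becomes the attribute suffix, the value is a record path.
        runner.setProperty("sport", "/sport");

        // The mock reader ignores flowfile content, so an empty flowfile is enough.
        runner.enqueue(new byte[0]);
        runner.run();

        runner.assertAllFlowFilesTransferred(CalculateRecordStats.REL_SUCCESS, 1);
        MockFlowFile out = runner.getFlowFilesForRelationship(CalculateRecordStats.REL_SUCCESS).get(0);
        out.assertAttributeEquals("record.count", "3");
        out.assertAttributeEquals("recordStats.sport", "3");
        out.assertAttributeEquals("recordStats.sport.Soccer", "2");
        out.assertAttributeEquals("recordStats.sport.Tennis", "1");
    }
}

With the default Limit of 10 all value counts are kept; lowering it trims the recordStats.sport.<value> attributes to the most frequent values while the recordStats.sport total and record.count attributes are always written.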