io.woolford.processors.nifibenford.BenfordsLaw.java Source code

Java tutorial

Introduction

Here is the source code for io.woolford.processors.nifibenford.BenfordsLaw.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.woolford.processors.nifibenford;

import org.apache.commons.math3.stat.inference.ChiSquareTest;
import org.apache.nifi.components.PropertyDescriptor;
import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.processor.*;
import org.apache.nifi.annotation.behavior.ReadsAttribute;
import org.apache.nifi.annotation.behavior.ReadsAttributes;
import org.apache.nifi.annotation.behavior.WritesAttribute;
import org.apache.nifi.annotation.behavior.WritesAttributes;
import org.apache.nifi.annotation.lifecycle.OnScheduled;
import org.apache.nifi.annotation.documentation.CapabilityDescription;
import org.apache.nifi.annotation.documentation.SeeAlso;
import org.apache.nifi.annotation.documentation.Tags;
import org.apache.nifi.processor.exception.ProcessException;
import org.apache.nifi.processor.util.StandardValidators;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UncheckedIOException;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.LongStream;

@Tags({ "Benfords Law", "fraud detection" })
@CapabilityDescription("Takes an input stream of text, typically the output from a document, and uses Benford's Law to classify the document as conforming or non-conforming.")
@SeeAlso({})
@ReadsAttributes({ @ReadsAttribute(attribute = "", description = "") })
@WritesAttributes({ @WritesAttribute(attribute = "", description = "") })
public class BenfordsLaw extends AbstractProcessor {

    public static final PropertyDescriptor ALPHA = new PropertyDescriptor.Builder().name("alpha").description(
            "This is the significance level at which documents will be classified as conforming or non-conforming.")
            .required(true).defaultValue("0.05").addValidator(StandardValidators.NON_EMPTY_VALIDATOR).build();

    public static final PropertyDescriptor MIN_SAMPLE = new PropertyDescriptor.Builder().name("min-sample")
            .description("This is the minimum number of numerical values to perform a Chi-squared test.")
            .required(true).defaultValue("5").addValidator(StandardValidators.NON_EMPTY_VALIDATOR).build();

    public static final Relationship NON_CONFORMING = new Relationship.Builder().name("NON_CONFORMING")
            .description("Non-conforming relationship").build();

    public static final Relationship CONFORMING = new Relationship.Builder().name("CONFORMING")
            .description("Conforming relationship").build();

    public static final Relationship INSUFFICIENT_SAMPLE = new Relationship.Builder().name("INSUFFICIENT_SAMPLE")
            .description("Insufficient numerical values to run a Chi-squared test").build();

    private List<PropertyDescriptor> descriptors;

    private Set<Relationship> relationships;

    @Override
    protected void init(final ProcessorInitializationContext context) {
        final List<PropertyDescriptor> descriptors = new ArrayList<PropertyDescriptor>();
        descriptors.add(ALPHA);
        descriptors.add(MIN_SAMPLE);
        this.descriptors = Collections.unmodifiableList(descriptors);

        final Set<Relationship> relationships = new HashSet<Relationship>();
        relationships.add(NON_CONFORMING);
        relationships.add(CONFORMING);
        relationships.add(INSUFFICIENT_SAMPLE);
        this.relationships = Collections.unmodifiableSet(relationships);
    }

    @Override
    public Set<Relationship> getRelationships() {
        return this.relationships;
    }

    @Override
    public final List<PropertyDescriptor> getSupportedPropertyDescriptors() {
        return descriptors;
    }

    @OnScheduled
    public void onScheduled(final ProcessContext context) {

    }

    @Override
    public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
        FlowFile flowFile = session.get();
        if (flowFile == null) {
            return;
        }

        InputStream inputStream = session.read(flowFile);
        String input = new BufferedReader(new InputStreamReader(inputStream)).lines()
                .collect(Collectors.joining("\n"));

        // TODO: since the values returned by Benford's array don't ever change, these could be hard-coded rather than calling a function each time.
        double[] benfordsArray = getBenfordsArray();
        long[] firstDigitArray = getFirstDigitArray(input);

        long sampleSize = LongStream.of(firstDigitArray).sum();

        ChiSquareTest chiSquareTest = new ChiSquareTest();
        Boolean suspect = chiSquareTest.chiSquareTest(benfordsArray, firstDigitArray,
                context.getProperty(ALPHA).asDouble());

        //TODO: don't perform the chi-squared test if the sample is too small
        if (sampleSize < context.getProperty(MIN_SAMPLE).asLong()) {
            session.transfer(flowFile, INSUFFICIENT_SAMPLE);
        } else if (suspect) {
            session.transfer(flowFile, NON_CONFORMING);
        } else {
            session.transfer(flowFile, CONFORMING);
        }

    }

    private static double[] getBenfordsArray() {
        // this is the expected distribtion of first digits for Benford's Law
        double[] benfordsArray = new double[9];
        for (int i = 1; i < 10; i++) {
            double benfordValue = 100 * Math.log10(1 + 1.0 / i);
            benfordsArray[i - 1] = benfordValue;
        }
        return benfordsArray;
    }

    private static long[] getFirstDigitArray(String input) {
        String[] inputArray = input.split("\\s+");
        long[] firstDigitArray = new long[9];
        for (String elem : inputArray) {
            String firstChar = String.valueOf(elem.charAt(0));
            boolean isDigit = firstChar.matches("[1-9]{1}");
            if (isDigit) {
                firstDigitArray[Integer.parseInt(firstChar) - 1] = firstDigitArray[Integer.parseInt(firstChar) - 1]
                        + 1;
            }
        }
        return firstDigitArray;
    }

}