org.datacleaner.spark.functions.RowProcessingFunction.java Source code

Introduction

Here is the source code for org.datacleaner.spark.functions.RowProcessingFunction.java. The class is part of DataCleaner's Apache Spark integration: it implements Spark's PairFlatMapFunction so that each partition of InputRows is pushed through the DataCleaner row-processing chain, any pending AnalyzerResultFutures are awaited, the components are closed, and the analyzer results are returned keyed by their component key.

Source

/**
 * DataCleaner (community edition)
 * Copyright (C) 2014 Neopost - Customer Information Management
 *
 * This copyrighted material is made available to anyone wishing to use, modify,
 * copy, or redistribute it subject to the terms and conditions of the GNU
 * Lesser General Public License, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 * for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this distribution; if not, write to:
 * Free Software Foundation, Inc.
 * 51 Franklin Street, Fifth Floor
 * Boston, MA  02110-1301  USA
 */
package org.datacleaner.spark.functions;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.ListIterator;

import org.apache.spark.api.java.function.PairFlatMapFunction;
import org.datacleaner.api.AnalyzerResult;
import org.datacleaner.api.AnalyzerResultFuture;
import org.datacleaner.api.HasAnalyzerResult;
import org.datacleaner.api.InputRow;
import org.datacleaner.configuration.DataCleanerConfiguration;
import org.datacleaner.job.AnalysisJob;
import org.datacleaner.job.runner.ActiveOutputDataStream;
import org.datacleaner.job.runner.ConsumeRowHandler;
import org.datacleaner.job.runner.RowProcessingConsumer;
import org.datacleaner.lifecycle.LifeCycleHelper;
import org.datacleaner.spark.NamedAnalyzerResult;
import org.datacleaner.spark.SparkJobContext;

import scala.Tuple2;

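/**
 * Spark function that pushes one partition of {@link InputRow}s through the
 * DataCleaner row-processing chain and emits the result of every
 * result-producing component as a (component key, result) pair.
 */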
public final class RowProcessingFunction
        implements PairFlatMapFunction<Iterator<InputRow>, String, NamedAnalyzerResult> {

    private static final long serialVersionUID = 1L;
    private final SparkJobContext _sparkJobContext;

    public RowProcessingFunction(final SparkJobContext sparkJobContext) {
        _sparkJobContext = sparkJobContext;
    }

    @Override
    public Iterable<Tuple2<String, NamedAnalyzerResult>> call(Iterator<InputRow> inputRowIterator)
            throws Exception {
        final DataCleanerConfiguration configuration = _sparkJobContext.getConfiguration();
        final AnalysisJob analysisJob = _sparkJobContext.getAnalysisJob();

        // set up processing stream (this also initializes the components)
        final ConsumeRowHandler consumeRowHandler;
        {
            final ConsumeRowHandler.Configuration handlerConfiguration = new ConsumeRowHandler.Configuration();
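            // include the analyzers in the consumer chain, but leave out tasks
            // that cannot run distributed on the workers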
            handlerConfiguration.includeAnalyzers = true;
            handlerConfiguration.includeNonDistributedTasks = false;
            consumeRowHandler = new ConsumeRowHandler(analysisJob, configuration, handlerConfiguration);
        }

        // fire row processing on each row
        while (inputRowIterator.hasNext()) {
            final InputRow inputRow = inputRowIterator.next();
            consumeRowHandler.consumeRow(inputRow);
        }

        // collect results
        final List<Tuple2<String, NamedAnalyzerResult>> analyzerResults = getAnalyzerResults(
                consumeRowHandler.getConsumers());

        // await any future results
        for (ListIterator<Tuple2<String, NamedAnalyzerResult>> it = analyzerResults.listIterator(); it.hasNext();) {
            final Tuple2<String, NamedAnalyzerResult> tuple = it.next();
            final NamedAnalyzerResult namedAnalyzerResult = tuple._2;
            final AnalyzerResult analyzerResult = namedAnalyzerResult.getAnalyzerResult();
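            // if the result is a future (still being computed), block until it
            // is available and swap the awaited value into the list in-place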
            if (analyzerResult instanceof AnalyzerResultFuture) {
                final AnalyzerResult awaitedResult = ((AnalyzerResultFuture<?>) analyzerResult).get();
                final NamedAnalyzerResult awaitedResultTuple = new NamedAnalyzerResult(
                        namedAnalyzerResult.getName(), awaitedResult);
                it.set(new Tuple2<>(tuple._1, awaitedResultTuple));
            }
        }

        // close components
        final LifeCycleHelper lifeCycleHelper = new LifeCycleHelper(configuration, analysisJob, false);
        for (RowProcessingConsumer consumer : consumeRowHandler.getConsumers()) {
            lifeCycleHelper.close(consumer.getComponentJob().getDescriptor(), consumer.getComponent(), true);
        }

        return analyzerResults;
    }

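    /**
     * Collects a (key, result) tuple for every result-producing consumer,
     * recursing into active output data streams so that the consumers of
     * published streams contribute their results as well.
     */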
    private List<Tuple2<String, NamedAnalyzerResult>> getAnalyzerResults(
            Collection<RowProcessingConsumer> rowProcessingConsumers) {
        List<Tuple2<String, NamedAnalyzerResult>> analyzerResults = new ArrayList<>();

        for (RowProcessingConsumer consumer : rowProcessingConsumers) {
            if (consumer.isResultProducer()) {
                final HasAnalyzerResult<?> resultProducer = (HasAnalyzerResult<?>) consumer.getComponent();
                final AnalyzerResult analyzerResult = resultProducer.getResult();
                final String key = _sparkJobContext.getComponentKey(consumer.getComponentJob());
                final NamedAnalyzerResult namedAnalyzerResult = new NamedAnalyzerResult(key, analyzerResult);
                final Tuple2<String, NamedAnalyzerResult> tuple = new Tuple2<>(key, namedAnalyzerResult);
                analyzerResults.add(tuple);
            }

            for (ActiveOutputDataStream activeOutputDataStream : consumer.getActiveOutputDataStreams()) {
                List<RowProcessingConsumer> outputDataStreamConsumers = activeOutputDataStream.getPublisher()
                        .getConsumers();
                List<Tuple2<String, NamedAnalyzerResult>> outputDataStreamsAnalyzerResults = getAnalyzerResults(
                        outputDataStreamConsumers);
                analyzerResults.addAll(outputDataStreamsAnalyzerResults);
            }
        }
        return analyzerResults;
    }
}
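
Usage example

The signature PairFlatMapFunction&lt;Iterator&lt;InputRow&gt;, String, NamedAnalyzerResult&gt; matches what JavaRDD.mapPartitionsToPair expects, so one call to the function above handles one Spark partition. The sketch below shows how the class could be wired up on the driver side; it assumes the Spark 1.x API that this source targets (call returning Iterable), and the class name RowProcessingExample, the method runRowProcessing and the already-built inputRows RDD and SparkJobContext are illustrative assumptions, not part of the DataCleaner sources.

import java.util.List;

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.datacleaner.api.InputRow;
import org.datacleaner.spark.NamedAnalyzerResult;
import org.datacleaner.spark.SparkJobContext;
import org.datacleaner.spark.functions.RowProcessingFunction;

import scala.Tuple2;

// Hypothetical example class -- not part of the DataCleaner sources.
public final class RowProcessingExample {

    // Runs the row-processing chain over every partition of the given RDD and
    // collects the per-analyzer results on the driver. Tuples sharing the same
    // component key are partial results of the same analyzer, one per partition,
    // and would still need to be reduced into a single result per component.
    public static List<Tuple2<String, NamedAnalyzerResult>> runRowProcessing(
            final JavaRDD<InputRow> inputRows, final SparkJobContext sparkJobContext) {
        final JavaPairRDD<String, NamedAnalyzerResult> resultsRdd =
                inputRows.mapPartitionsToPair(new RowProcessingFunction(sparkJobContext));
        return resultsRdd.collect();
    }
}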