com.cloudera.oryx.kmeans.computation.local.Summarize.java Source code

Java tutorial

Introduction

Here is the source code for com.cloudera.oryx.kmeans.computation.local.Summarize.java

Source

/*
 * Copyright (c) 2013, Cloudera, Inc. All Rights Reserved.
 *
 * Cloudera, Inc. licenses this file to you under the Apache License,
 * Version 2.0 (the "License"). You may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied. See the License for
 * the specific language governing permissions and limitations under the
 * License.
 */

package com.cloudera.oryx.kmeans.computation.local;

import com.cloudera.oryx.common.io.DelimitedDataUtils;
import com.cloudera.oryx.common.io.IOUtils;
import com.cloudera.oryx.common.iterator.FileLineIterable;
import com.cloudera.oryx.common.settings.ConfigUtils;
import com.cloudera.oryx.common.settings.InboundSettings;
import com.cloudera.oryx.computation.common.summary.InternalStats;
import com.cloudera.oryx.computation.common.summary.Summary;
import com.cloudera.oryx.computation.common.summary.SummaryStats;
import com.google.common.collect.Lists;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.concurrent.Callable;

/**
 * Computes per-column summary statistics over the delimited input files found
 * in a directory.
 *
 * <p>Column roles (ignored / categorical / numeric) and column names are taken
 * from the {@link InboundSettings} built from the default configuration. Each
 * non-empty line of every input file counts as one record.
 */
public final class Summarize implements Callable<Summary> {

    private static final Logger log = LoggerFactory.getLogger(Summarize.class);

    private final File inputDir;

    /**
     * @param inputDir directory whose files accepted by
     *     {@code IOUtils.CSV_COMPRESSED_FILTER} will be summarized
     */
    public Summarize(File inputDir) {
        this.inputDir = inputDir;
    }

    /**
     * Reads every matching input file and accumulates statistics for each
     * categorical or numeric column.
     *
     * @return the summary over all input files, or {@code null} if the input
     *     directory yields no matching files (including the case where
     *     {@link File#listFiles(java.io.FilenameFilter)} itself returns
     *     {@code null}). In the returned summary, the per-column stats list has
     *     a {@code null} entry for every column that is neither categorical nor
     *     numeric.
     * @throws IOException declared for failures while reading the input
     * @throws NumberFormatException if a value in a numeric column cannot be
     *     parsed as a double
     * @throws ArrayIndexOutOfBoundsException if a line has no value for a
     *     non-ignored categorical or numeric column
     */
    @Override
    public Summary call() throws IOException {
        File[] inputFiles = inputDir.listFiles(IOUtils.CSV_COMPRESSED_FILTER);
        if (inputFiles == null || inputFiles.length == 0) {
            log.warn("No .csv or .gz input files found in input directory");
            return null;
        }

        InboundSettings inboundSettings = InboundSettings.create(ConfigUtils.getDefaultConfig());
        // Fetched once; used for both the column count and the per-column names below.
        List<String> columnNames = inboundSettings.getColumnNames();
        int numFeatures = columnNames.size();

        // One accumulator per categorical/numeric column, null for every other column.
        // Note that the "ignored" flag is not consulted here, so a column that is
        // flagged as ignored AND categorical/numeric still gets an (empty) accumulator.
        List<InternalStats> internalStats = Lists.newArrayListWithExpectedSize(numFeatures);
        for (int col = 0; col < numFeatures; col++) {
            boolean tracked = inboundSettings.isCategorical(col) || inboundSettings.isNumeric(col);
            internalStats.add(tracked ? new InternalStats() : null);
        }

        int totalRecords = 0;
        for (File inputFile : inputFiles) {
            log.info("Summarizing input from {}", inputFile.getName());
            totalRecords += accumulate(inputFile, inboundSettings, internalStats, numFeatures);
        }

        List<SummaryStats> stats = Lists.newArrayListWithExpectedSize(numFeatures);
        for (int col = 0; col < numFeatures; col++) {
            InternalStats internal = internalStats.get(col);
            stats.add(internal == null ? null : internal.toSummaryStats(columnNames.get(col), totalRecords));
        }

        return new Summary(totalRecords, numFeatures, stats);
    }

    /**
     * Folds every non-empty line of one input file into the per-column accumulators.
     *
     * @return the number of non-empty lines (records) read from {@code inputFile}
     */
    private static int accumulate(File inputFile,
                                  InboundSettings inboundSettings,
                                  List<InternalStats> internalStats,
                                  int numFeatures) throws IOException {
        int records = 0;
        for (String line : new FileLineIterable(inputFile)) {
            if (line.isEmpty()) {
                continue;
            }
            records++;
            String[] tokens = DelimitedDataUtils.decode(line);
            for (int col = 0; col < numFeatures; col++) {
                if (inboundSettings.isIgnored(col)) {
                    continue;
                }
                if (inboundSettings.isCategorical(col)) {
                    internalStats.get(col).addCategorical(tokens[col]);
                } else if (inboundSettings.isNumeric(col)) {
                    // parseDouble yields the primitive directly; valueOf would box
                    // a Double for every numeric cell only to unbox it again.
                    internalStats.get(col).addNumeric(Double.parseDouble(tokens[col]));
                }
            }
        }
        return records;
    }
}