com.yolodata.tbana.cascading.shuttl.ShuttlCsv.java Source code

Java tutorial

Introduction

Here is the source code for com.yolodata.tbana.cascading.shuttl.ShuttlCsv.java

Source

/*
 * Copyright (c) 2013 Yolodata, LLC,  All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.yolodata.tbana.cascading.shuttl;

import cascading.flow.FlowProcess;
import cascading.scheme.SinkCall;
import cascading.scheme.SourceCall;
import cascading.scheme.hadoop.TextLine;
import cascading.tap.CompositeTap;
import cascading.tap.Tap;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import com.yolodata.tbana.hadoop.mapred.splunk.SplunkDataQuery;
import com.yolodata.tbana.hadoop.mapred.shuttl.ShuttlCSVInputFormat;
import com.yolodata.tbana.hadoop.mapred.shuttl.ShuttlInputFormatConstants;
import com.yolodata.tbana.hadoop.mapred.util.ArrayListTextWritable;
import com.yolodata.tbana.hadoop.mapred.util.CSVReader;
import com.yolodata.tbana.util.search.ShuttlCsvFileFinder;
import org.apache.commons.lang.NotImplementedException;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;

import java.io.IOException;
import java.io.InputStreamReader;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
 * Source-only Cascading scheme that reads records through
 * {@link ShuttlCSVInputFormat}.
 *
 * <p>Each record delivered by the input format (a {@link LongWritable} key and an
 * {@link ArrayListTextWritable} value) becomes one tuple: the key first, followed
 * by every {@link Text} element of the value. The index list and time range of the
 * supplied {@link SplunkDataQuery} are copied into the job configuration when the
 * source is initialised.
 *
 * <p>Sinking is not supported; both sink methods throw
 * {@link NotImplementedException}.
 */
public class ShuttlCsv extends TextLine {

    private static final Logger LOG = Logger.getLogger(ShuttlCsv.class.getName());

    /** Query whose indexes and time range are written into the job configuration. */
    private final SplunkDataQuery splunkDataQuery;

    /** Creates a scheme with a default-constructed {@link SplunkDataQuery} and {@link Fields#ALL}. */
    public ShuttlCsv() {
        this(new SplunkDataQuery());
    }

    /**
     * Creates a scheme for the given query, sourcing {@link Fields#ALL}.
     *
     * @param splunkDataQuery query supplying the index list and time range
     */
    public ShuttlCsv(SplunkDataQuery splunkDataQuery) {
        this(Fields.ALL, splunkDataQuery);
    }

    /**
     * Creates a scheme for the given query with explicit source fields.
     *
     * @param fields          source fields to declare
     * @param splunkDataQuery query supplying the index list and time range
     */
    public ShuttlCsv(Fields fields, SplunkDataQuery splunkDataQuery) {
        super();
        this.splunkDataQuery = splunkDataQuery;
        setSourceFields(fields);
    }

    /**
     * Writes the query's index list, earliest time and latest time into {@code conf}
     * and selects {@link ShuttlCSVInputFormat} as the job's input format.
     */
    @Override
    public void sourceConfInit(FlowProcess<JobConf> flowProcess, Tap<JobConf, RecordReader, OutputCollector> tap,
            JobConf conf) {

        conf.set(ShuttlInputFormatConstants.INDEX_LIST, splunkDataQuery.getIndexesString());
        conf.set(ShuttlInputFormatConstants.EARLIEST_TIME, splunkDataQuery.getEarliestTimeString());
        conf.set(ShuttlInputFormatConstants.LATEST_TIME, splunkDataQuery.getLatestTimeString());
        conf.setInputFormat(ShuttlCSVInputFormat.class);
    }

    /**
     * Not supported.
     *
     * @throws NotImplementedException always
     */
    @Override
    public void sinkConfInit(FlowProcess<JobConf> flowProcess, Tap<JobConf, RecordReader, OutputCollector> tap,
            JobConf conf) {
        throw new NotImplementedException("Writing to Shuttl not implemented");
    }

    /**
     * Reads the next record from the underlying {@link RecordReader} and, when one
     * is available, publishes it as the incoming tuple (key followed by all values).
     *
     * @return {@code false} once the reader is exhausted, {@code true} otherwise
     * @throws IOException if the record reader fails
     */
    @Override
    public boolean source(FlowProcess<JobConf> flowProcess, SourceCall<Object[], RecordReader> sourceCall)
            throws IOException {
        Object[] context = sourceCall.getContext();
        Object key = context[0];
        Object value = context[1];

        if (!sourceCall.getInput().next(key, value)) {
            return false;
        }

        // Skip nulls: report success but leave the incoming entry untouched.
        // NOTE(review): in this case no new tuple is set for the caller - confirm
        // this is the intended behaviour.
        if (key == null || value == null) {
            return true;
        }

        Tuple result = new Tuple();
        result.add((LongWritable) key);
        for (Text textValue : (ArrayListTextWritable) value) {
            result.add(textValue);
        }

        sourceCall.getIncomingEntry().setTuple(result);
        return true;
    }

    /**
     * Derives the source fields from the first line of one CSV file located by
     * {@link ShuttlCsvFileFinder} under the tap's path.
     *
     * <p>When a line is read, the source fields become {@code "offset"} followed by
     * one field per value on that line. When nothing is read, or an
     * {@link IOException} occurs, the previously configured source fields are kept;
     * the failure is logged rather than propagated.
     *
     * @param flowProcess supplies the job configuration used to reach the file system
     * @param tap         the tap being sourced; for a {@link CompositeTap} only the
     *                    first child is inspected
     * @return the (possibly updated) source fields of this scheme
     */
    @Override
    public Fields retrieveSourceFields(FlowProcess<JobConf> flowProcess, Tap tap) {
        // no need to open them all
        if (tap instanceof CompositeTap) {
            tap = (Tap) ((CompositeTap) tap).getChildTaps().next();
        }

        JobConf conf = flowProcess.getConfigCopy();
        String path = tap.getFullIdentifier(conf);

        try {
            // FileSystem.get returns a shared instance, so it is deliberately not closed here.
            FileSystem fileSystem = FileSystem.get(conf);
            ShuttlCsvFileFinder fileFinder = new ShuttlCsvFileFinder(fileSystem, new Path(path));
            Path pathToOneOfTheBucketCsvs = fileFinder.findSingleFile(splunkDataQuery);

            FSDataInputStream in = fileSystem.open(pathToOneOfTheBucketCsvs);
            try {
                // Decode explicitly as UTF-8 (the encoding used by Hadoop Text) instead of
                // relying on the platform default charset.
                CSVReader reader = new CSVReader(new InputStreamReader(in, "UTF-8"));
                ArrayListTextWritable values = new ArrayListTextWritable();
                if (reader.readLine(values) != 0) {
                    // First field is always offset
                    Fields fields = new Fields("offset");
                    for (Text t : values) {
                        fields = fields.append(new Fields(t.toString()));
                    }
                    setSourceFields(fields);
                }
            } finally {
                // Always release the file handle, even if reading the header fails.
                in.close();
            }
        } catch (IOException e) {
            // Best effort: keep the configured fields, but do not hide the failure.
            LOG.log(Level.WARNING, "Could not read CSV header under " + path
                    + "; keeping configured source fields", e);
        }

        return getSourceFields();
    }

    /** Drops the per-call context so the key/value holders can be reclaimed. */
    @Override
    public void sourceCleanup(FlowProcess<JobConf> flowProcess, SourceCall<Object[], RecordReader> sourceCall) {
        sourceCall.setContext(null);
    }

    /**
     * Not supported.
     *
     * @throws NotImplementedException always
     */
    @Override
    public void sink(FlowProcess<JobConf> flowProcess, SinkCall<Object[], OutputCollector> sinkCall)
            throws IOException {
        throw new NotImplementedException("Writing to Shuttl not implemented");
    }
}