com.cloudera.cdk.tools.CombinedLogFormatConverter.java Source code

Introduction

Here is the source code for com.cloudera.cdk.tools.CombinedLogFormatConverter.java, a Crunch-based command-line tool that converts web server access logs in Combined Log Format into a CDK Dataset of Avro records.

Source

/**
 * Copyright 2013 Cloudera Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.cloudera.cdk.tools;

import com.cloudera.cdk.data.DatasetDescriptor;
import com.cloudera.cdk.data.DatasetRepository;
import com.cloudera.cdk.data.filesystem.FileSystemDatasetRepository;
import com.google.common.io.Resources;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecordBuilder;
import org.apache.crunch.DoFn;
import org.apache.crunch.Emitter;
import org.apache.crunch.PCollection;
import org.apache.crunch.Target;
import org.apache.crunch.io.avro.AvroFileTarget;
import org.apache.crunch.types.avro.AvroType;
import org.apache.crunch.types.avro.Avros;
import org.apache.crunch.util.CrunchTool;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.ToolRunner;

/**
 * A tool for converting files in
 * <a href="http://en.wikipedia.org/wiki/Common_Log_Format">Combined Log Format</a> to a
 * {@link com.cloudera.cdk.data.Dataset}.
 */
@edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "SE_NO_SERIALVERSIONID", justification = "Serialization not needed between different versions of this class")
public class CombinedLogFormatConverter extends CrunchTool {

    // Regular expression for one Combined Log Format line. The nine capture groups are, in order:
    // host, RFC 931 identity, username, datetime, request line, HTTP status code, response size,
    // referrer, and user agent.
    private static final String LOG_PATTERN = "^(\\S+) (\\S+) (\\S+) \\["
            + "([\\w:/]+\\s[+\\-]\\d{4})\\] \"(.+?)\" (\\d{3}) (\\d+|-) \"([^\"]*)\" \"([^\"]+)\"";

    @Override
    public int run(String... args) throws Exception {
        if (args.length != 3) {
            System.err.println("Usage: " + CombinedLogFormatConverter.class.getSimpleName()
                    + " <input> <dataset_root> <dataset name>");
            return 1;
        }
        String input = args[0];
        String datasetRoot = args[1];
        String datasetName = args[2];

        Schema schema = new Schema.Parser().parse(Resources.getResource("combined_log_format.avsc").openStream());

        // Create the dataset
        Path root = new Path(datasetRoot);
        Configuration conf = new Configuration();
        FileSystem fs = root.getFileSystem(conf);

        DatasetRepository repo = new FileSystemDatasetRepository.Builder().fileSystem(fs).rootDirectory(root)
                .build();
        DatasetDescriptor datasetDescriptor = new DatasetDescriptor.Builder().schema(schema).build();

        repo.create(datasetName, datasetDescriptor);

        // Run the job
        final String schemaString = schema.toString();
        AvroType<GenericData.Record> outputType = Avros.generics(schema);
        PCollection<String> lines = readTextFile(input);
        PCollection<GenericData.Record> records = lines.parallelDo(new ConvertFn(schemaString), outputType);
        getPipeline().write(records, new AvroFileTarget(new Path(root, datasetName)), Target.WriteMode.APPEND);
        run();
        return 0;
    }

    // In Combined Log Format a lone "-" marks a missing value; map it (and its quoted form) to null.
    private String asString(String s) {
        if ("-".equals(s) || "\"-\"".equals(s)) {
            return null;
        }
        return s;
    }

    // Numeric counterpart of asString: a lone "-" (no value logged) becomes null.
    private Integer asInt(String s) {
        if ("-".equals(s)) {
            return null;
        }
        return Integer.parseInt(s);
    }

    public static void main(String... args) throws Exception {
        int rc = ToolRunner.run(new CombinedLogFormatConverter(), args);
        System.exit(rc);
    }

    @edu.umd.cs.findbugs.annotations.SuppressWarnings({ "SE_INNER_CLASS", "SE_TRANSIENT_FIELD_NOT_RESTORED" })
    private class ConvertFn extends DoFn<String, GenericData.Record> {
        // Avro's Schema class is not Serializable in the Avro version this code targets, so the
        // DoFn carries the schema as its JSON string and re-parses it in initialize().
        private final String schemaString;
        transient Pattern pattern;
        transient GenericRecordBuilder recBuilder;

        public ConvertFn(String schemaString) {
            this.schemaString = schemaString;
        }

        // Called once per task before processing; rebuild the transient helpers here.
        @Override
        public void initialize() {
            pattern = Pattern.compile(LOG_PATTERN);
            recBuilder = new GenericRecordBuilder(new Schema.Parser().parse(schemaString));
        }

        @Override
        public void process(String line, Emitter<GenericData.Record> emitter) {
            Matcher matcher = pattern.matcher(line);
            if (matcher.matches()) {
                // parse line into components
                recBuilder.set("host", asString(matcher.group(1)));
                recBuilder.set("rfc931_identity", asString(matcher.group(2)));
                recBuilder.set("username", asString(matcher.group(3)));
                recBuilder.set("datetime", asString(matcher.group(4)));
                recBuilder.set("request", asString(matcher.group(5)));
                recBuilder.set("http_status_code", asInt(matcher.group(6)));
                recBuilder.set("response_size", asInt(matcher.group(7)));
                recBuilder.set("referrer", asString(matcher.group(8)));
                recBuilder.set("user_agent", asString(matcher.group(9)));
                emitter.emit(recBuilder.build());
            } else {
                System.err.println("No match: " + line);
            }

        }
    }
}
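
Example

For reference, the following standalone sketch exercises the same LOG_PATTERN regular expression on a typical Combined Log Format line and prints each capture group. The sample line and the class name LogPatternDemo are illustrative only and do not come from the CDK sources.

import java.util.regex.Matcher;
import java.util.regex.Pattern;

// Illustrative sketch: applies the converter's regular expression to one sample log line.
public class LogPatternDemo {

    private static final String LOG_PATTERN = "^(\\S+) (\\S+) (\\S+) \\["
            + "([\\w:/]+\\s[+\\-]\\d{4})\\] \"(.+?)\" (\\d{3}) (\\d+|-) \"([^\"]*)\" \"([^\"]+)\"";

    public static void main(String[] args) {
        String line = "127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "
                + "\"GET /apache_pb.gif HTTP/1.0\" 200 2326 "
                + "\"http://www.example.com/start.html\" \"Mozilla/4.08 [en] (Win98; I ;Nav)\"";
        String[] fields = { "host", "rfc931_identity", "username", "datetime", "request",
                "http_status_code", "response_size", "referrer", "user_agent" };

        Matcher matcher = Pattern.compile(LOG_PATTERN).matcher(line);
        if (matcher.matches()) {
            // Group numbers are 1-based; group(0) is the whole line.
            for (int i = 0; i < fields.length; i++) {
                System.out.println(fields[i] + " = " + matcher.group(i + 1));
            }
        } else {
            System.err.println("No match: " + line);
        }
    }
}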
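
The converter reads its record schema from combined_log_format.avsc on the classpath; that file is not reproduced on this page. The sketch below, built with Avro's SchemaBuilder, shows the shape the converter appears to expect, inferred from the field names set in ConvertFn and from the nullable values returned by asString and asInt. The record name and the exact field types are assumptions; the schema that ships with the CDK may differ.

import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;

// Assumed shape of combined_log_format.avsc, inferred from ConvertFn; not the file shipped with the CDK.
public class CombinedLogFormatSchemaSketch {

    public static void main(String[] args) {
        Schema schema = SchemaBuilder.record("CombinedLogFormat")
                .fields()
                .optionalString("host")
                .optionalString("rfc931_identity")
                .optionalString("username")
                .optionalString("datetime")
                .optionalString("request")
                .optionalInt("http_status_code")
                .optionalInt("response_size")      // "-" (no bytes returned) becomes null
                .optionalString("referrer")
                .optionalString("user_agent")
                .endRecord();
        System.out.println(schema.toString(true)); // pretty-print the JSON schema
    }
}

Once packaged, the tool itself is driven through ToolRunner with the three arguments shown in its usage message: an input path of raw log files, the root directory for the dataset repository, and the name of the dataset to create.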