com.cloudera.cdk.morphline.stdio.AbstractParser.java Source code

Java tutorial

Introduction

Here is the source code for com.cloudera.cdk.morphline.stdio.AbstractParser.java

Source

/*
 * Copyright 2013 Cloudera Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.cloudera.cdk.morphline.stdio;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;

import com.cloudera.cdk.morphline.api.Command;
import com.cloudera.cdk.morphline.api.CommandBuilder;
import com.cloudera.cdk.morphline.api.MorphlineContext;
import com.cloudera.cdk.morphline.api.MorphlineRuntimeException;
import com.cloudera.cdk.morphline.api.Record;
import com.cloudera.cdk.morphline.base.AbstractCommand;
import com.cloudera.cdk.morphline.base.Fields;
import com.cloudera.cdk.morphline.base.Metrics;
import com.codahale.metrics.Meter;
import com.google.common.base.Preconditions;
import com.google.common.io.Closeables;
import com.typesafe.config.Config;

/**
 * Base class for convenient implementation of morphline parsers.
 * <p>
 * A parser reads the attachment stored in the {@link Fields#ATTACHMENT_BODY} field of a record
 * (either a {@code byte[]} or an {@link InputStream}) and hands it to
 * {@link #doProcess(Record, InputStream)}. If MIME types were registered, via the
 * {@value #SUPPORTED_MIME_TYPES} config parameter or {@link #addSupportedMimeType(String)}, records
 * whose {@link Fields#ATTACHMENT_MIME_TYPE} does not match any of them are rejected.
 */
public abstract class AbstractParser extends AbstractCommand {

    /** Name of the optional config parameter holding the list of MIME types accepted by the parser. */
    public static final String SUPPORTED_MIME_TYPES = "supportedMimeTypes";

    /** Type or subtype token of a MIME range pattern that matches anything. */
    private static final String WILDCARD = "*";

    /** Marked once per call to {@link #incrementNumRecords()} while metrics are being measured. */
    private final Meter numRecordsMeter;

    /** Registered MIME types; {@code null} means nothing was registered and every type is accepted. */
    private Set<MediaType> supportedMimeTypes = null;

    /**
     * Creates the parser and registers the MIME types listed under {@value #SUPPORTED_MIME_TYPES}
     * in the given config, if any.
     */
    protected AbstractParser(CommandBuilder builder, Config config, Command parent, Command child,
            MorphlineContext context) {
        super(builder, config, parent, child, context);
        addConfiguredMimeTypes(config);
        this.numRecordsMeter = getMeter(Metrics.NUM_RECORDS);
    }

    /**
     * Deprecated; will be removed in the next release.
     *
     * @deprecated use
     *             {@link #AbstractParser(CommandBuilder, Config, Command, Command, MorphlineContext)}
     *             instead
     */
    @Deprecated
    protected AbstractParser(Config config, Command parent, Command child, MorphlineContext context) {
        super(config, parent, child, context);
        addConfiguredMimeTypes(config);
        this.numRecordsMeter = getMeter(Metrics.NUM_RECORDS);
    }

    /** Registers every MIME type listed under {@value #SUPPORTED_MIME_TYPES}; a missing list registers nothing. */
    private void addConfiguredMimeTypes(Config config) {
        List<String> mimeTypes = getConfigs().getStringList(config, SUPPORTED_MIME_TYPES,
                Collections.<String>emptyList());
        for (String mimeType : mimeTypes) {
            addSupportedMimeType(mimeType);
        }
    }

    /**
     * Adds the given MIME type (or range pattern such as {@code text/*}) to the set of types this
     * parser accepts. Once at least one type has been added, records with other types are rejected.
     *
     * @param mediaType
     *            the MIME type string; it is trimmed, lower-cased and reduced to its base type
     */
    protected void addSupportedMimeType(String mediaType) {
        if (supportedMimeTypes == null) {
            supportedMimeTypes = new HashSet<MediaType>();
        }
        supportedMimeTypes.add(parseMimeType(mediaType));
    }

    /**
     * Validates the record's attachment and MIME type, then delegates to
     * {@link #doProcess(Record, InputStream)}.
     *
     * @return false if the record has no attachment body or its MIME type is not supported;
     *         otherwise the result of {@link #doProcess(Record, InputStream)}
     * @throws MorphlineRuntimeException
     *             wrapping any {@link IOException} thrown while parsing
     */
    @Override
    protected boolean doProcess(Record record) {
        if (!hasAtLeastOneAttachment(record)) {
            return false;
        }

        // TODO: make field for stream configurable
        String streamMediaType = (String) record.getFirstValue(Fields.ATTACHMENT_MIME_TYPE);
        if (!isMimeTypeSupported(streamMediaType, record)) {
            return false;
        }

        InputStream stream = getAttachmentInputStream(record);
        try {
            return doProcess(record, stream);
        } catch (IOException e) {
            throw new MorphlineRuntimeException(e);
        } finally {
            // the stream is always closed here, so implementations need not close it themselves
            Closeables.closeQuietly(stream);
        }
    }

    /**
     * Parses the given attachment stream of the given record.
     *
     * @param record
     *            the record the attachment was taken from
     * @param stream
     *            the attachment body; closed by the caller after this method returns
     * @return the command's success status, passed through by {@link #doProcess(Record)}
     * @throws IOException
     *             if reading the stream fails; rethrown as {@link MorphlineRuntimeException}
     */
    protected abstract boolean doProcess(Record record, InputStream stream) throws IOException;

    /** Marks the records meter once, but only while metrics are being measured. */
    protected void incrementNumRecords() {
        if (isMeasuringMetrics()) {
            numRecordsMeter.mark();
        }
    }

    /**
     * Returns true if no MIME types were registered, or if the given MIME type equals or falls
     * within one of the registered types; false otherwise (including when the record carries no
     * MIME type field at all).
     */
    private boolean isMimeTypeSupported(String mediaTypeStr, Record record) {
        if (supportedMimeTypes == null) {
            return true; // no restriction configured
        }
        if (!hasAtLeastOneMimeType(record)) {
            return false;
        }
        MediaType mediaType = parseMimeType(mediaTypeStr);
        if (supportedMimeTypes.contains(mediaType)) {
            return true; // fast path
        }
        // wildcard matching
        for (MediaType rangePattern : supportedMimeTypes) {
            if (isMimeTypeMatch(mediaType, rangePattern)) {
                return true;
            }
        }
        LOG.debug("No supported MIME type found for {}={}", Fields.ATTACHMENT_MIME_TYPE, mediaTypeStr);
        return false;
    }

    /**
     * Parses the trimmed, lower-cased string into a media type and returns its base type.
     *
     * @throws MorphlineRuntimeException
     *             if the string cannot be parsed into a media type
     */
    private MediaType parseMimeType(String mediaTypeStr) {
        MediaType mediaType = MediaType.parse(mediaTypeStr.trim().toLowerCase(Locale.ROOT));
        // NOTE(review): MediaType.parse presumably returns null for malformed input (as Tika's
        // MediaType.parse does) - confirm; this guard gives a descriptive error instead of an NPE
        if (mediaType == null) {
            throw new MorphlineRuntimeException("Illegal MIME type: " + mediaTypeStr);
        }
        return mediaType.getBaseType();
    }

    /** Returns true if mediaType falls within the given range (pattern), false otherwise */
    private boolean isMimeTypeMatch(MediaType mediaType, MediaType rangePattern) {
        String rangePatternType = rangePattern.getType();
        String rangePatternSubtype = rangePattern.getSubtype();
        return (rangePatternType.equals(WILDCARD) || rangePatternType.equals(mediaType.getType()))
                && (rangePatternSubtype.equals(WILDCARD) || rangePatternSubtype.equals(mediaType.getSubtype()));
    }

    /**
     * Determines the charset to use for the attachment of the given record.
     *
     * @param record
     *            the record whose {@link Fields#ATTACHMENT_CHARSET} field is consulted as fallback
     * @param charset
     *            an explicitly requested charset; returned as is if non-null
     * @return the given charset if non-null, otherwise the charset named by the first
     *         {@link Fields#ATTACHMENT_CHARSET} value of the record
     * @throws MorphlineRuntimeException
     *             if no charset was given and the record carries none
     */
    protected Charset detectCharset(Record record, Charset charset) {
        if (charset != null) {
            return charset;
        }
        List<?> charsets = record.get(Fields.ATTACHMENT_CHARSET);
        if (charsets.isEmpty()) {
            // TODO try autodetection (AutoDetectReader)
            throw new MorphlineRuntimeException("Missing charset for record: " + record);
        }
        String charsetName = (String) charsets.get(0);
        return Charset.forName(charsetName);
    }

    /** Returns true if the record has an {@link Fields#ATTACHMENT_BODY} field; logs and returns false otherwise. */
    private boolean hasAtLeastOneAttachment(Record record) {
        if (!record.getFields().containsKey(Fields.ATTACHMENT_BODY)) {
            LOG.debug("Command failed because of missing attachment for record: {}", record);
            return false;
        }
        return true;
    }

    /** Returns true if the record has an {@link Fields#ATTACHMENT_MIME_TYPE} field; logs and returns false otherwise. */
    private boolean hasAtLeastOneMimeType(Record record) {
        if (!record.getFields().containsKey(Fields.ATTACHMENT_MIME_TYPE)) {
            LOG.debug("Command failed because of missing MIME type for record: {}", record);
            return false;
        }
        return true;
    }

    /**
     * Returns the first {@link Fields#ATTACHMENT_BODY} value of the record as a stream: a
     * {@code byte[]} is wrapped into a {@link ByteArrayInputStream}, anything else is cast to
     * {@link InputStream}.
     */
    private InputStream getAttachmentInputStream(Record record) {
        Object body = record.getFirstValue(Fields.ATTACHMENT_BODY);
        Preconditions.checkNotNull(body, "Attachment body must not be null");
        if (body instanceof byte[]) {
            return new ByteArrayInputStream((byte[]) body);
        } else {
            return (InputStream) body;
        }
    }

    /**
     * Removes all attachment related fields (body, MIME type, charset and name) from the given
     * record.
     */
    public static void removeAttachments(Record outputRecord) {
        outputRecord.removeAll(Fields.ATTACHMENT_BODY);
        outputRecord.removeAll(Fields.ATTACHMENT_MIME_TYPE);
        outputRecord.removeAll(Fields.ATTACHMENT_CHARSET);
        outputRecord.removeAll(Fields.ATTACHMENT_NAME);
    }

    /** Returns a buffer size suited to the kind of the given stream. */
    int getBufferSize(InputStream stream) {
        if (stream instanceof ByteArrayInputStream) {
            return 1024; // probably a single log line from Flume
        } else {
            return 8192; // same as default for new BufferedReader()
        }
    }

}