cascading.flow.hadoop.util.HadoopMRUtil.java Source code

Java tutorial

Introduction

Here is the source code for cascading.flow.hadoop.util.HadoopMRUtil.java

Source

/*
 * Copyright (c) 2007-2015 Concurrent, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cascading.flow.hadoop.util;

import java.io.IOException;
import java.net.URI;

import cascading.flow.FlowException;
import cascading.flow.hadoop.HadoopFlowProcess;
import cascading.scheme.hadoop.TextLine;
import cascading.tap.SinkMode;
import cascading.tap.hadoop.Hfs;
import cascading.tap.hadoop.Lfs;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntryCollector;
import cascading.tuple.TupleEntryIterator;
import cascading.util.Util;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 *
 */
/**
 * Utility methods for shipping oversized Cascading step state between the
 * submitting client and MapReduce tasks via the Hadoop {@link DistributedCache},
 * used when the serialized state is too large to embed in the {@link JobConf}.
 */
public class HadoopMRUtil {
    private static final Logger LOG = LoggerFactory.getLogger(HadoopMRUtil.class);

    private HadoopMRUtil() {
        // static utility class; no instances
    }

    /**
     * Writes the given step state to a temporary file on the Hadoop FS and
     * registers that file with the distributed cache on the given job conf.
     *
     * @param conf      job configuration; receives the cache-file entry
     * @param id        step id, embedded in the temp file name so tasks can find it
     * @param kind      state kind label, embedded in the temp file name
     * @param stepState serialized step state; may be null or empty
     * @return the Hadoop FS path the state was written to, or {@code null} when
     *         {@code stepState} is empty (nothing is written or cached)
     * @throws FlowException if the state cannot be written to the Hadoop FS
     */
    public static String writeStateToDistCache(JobConf conf, String id, String kind, String stepState) {
        if (Util.isEmpty(stepState))
            return null;

        LOG.info("writing step state to dist cache, too large for job conf, size: {}", stepState.length());

        String statePath = Hfs.getTempPath(conf) + "/" + kind + "-state-" + id;

        Hfs temp = new Hfs(new TextLine(), statePath, SinkMode.REPLACE);

        TupleEntryCollector writer = null;

        try {
            writer = temp.openForWrite(new HadoopFlowProcess(conf));

            writer.add(new Tuple(stepState));
        } catch (IOException exception) {
            // chain the cause so the underlying FS failure is not lost
            // (matches the style of readStateFromDistCache below)
            throw new FlowException("unable to write step state to Hadoop FS: " + temp.getIdentifier(), exception);
        } finally {
            // close in finally so the temp file handle is released even when add() fails
            if (writer != null)
                writer.close();
        }

        URI uri = new Path(statePath).toUri();
        DistributedCache.addCacheFile(uri, conf);

        LOG.info("using step state path: {}", uri);

        return statePath;
    }

    /**
     * Reads step state previously written by
     * {@link #writeStateToDistCache(JobConf, String, String, String)} from the
     * task-local copy of the distributed cache.
     *
     * @param jobConf job configuration carrying the localized cache-file entries
     * @param id      step id used when the state was written
     * @param kind    state kind label used when the state was written
     * @return the first line of the localized state file
     * @throws FlowException if no matching cache file exists, the file is empty,
     *                       or it cannot be read
     * @throws IOException   declared for interface compatibility; read failures
     *                       are wrapped in {@link FlowException}
     */
    public static String readStateFromDistCache(JobConf jobConf, String id, String kind) throws IOException {
        Path[] files = DistributedCache.getLocalCacheFiles(jobConf);

        // getLocalCacheFiles returns null when no cache files are registered;
        // guard to avoid an NPE and fail with the same diagnostic as below
        if (files == null)
            throw new FlowException("unable to find step state from distributed cache");

        Path stepStatePath = null;

        for (Path file : files) {
            if (!file.toString().contains(kind + "-state-" + id))
                continue;

            stepStatePath = file;
            break;
        }

        if (stepStatePath == null)
            throw new FlowException("unable to find step state from distributed cache");

        LOG.info("reading step state from local path: {}", stepStatePath);

        // Lfs: the cache file has been localized to the task's local filesystem
        Hfs temp = new Lfs(new TextLine(new Fields("line")), stepStatePath.toString());

        TupleEntryIterator reader = null;

        try {
            reader = temp.openForRead(new HadoopFlowProcess(jobConf));

            if (!reader.hasNext())
                throw new FlowException("step state path is empty: " + temp.getIdentifier());

            return reader.next().getString(0);
        } catch (IOException exception) {
            throw new FlowException("unable to find state path: " + temp.getIdentifier(), exception);
        } finally {
            try {
                if (reader != null)
                    reader.close();
            } catch (IOException exception) {
                // best-effort close; original read result (or exception) takes precedence
                LOG.warn("error closing state path reader", exception);
            }
        }
    }
}