org.apache.hadoop.mapreduce.lib.input.RHadoopInputFormat.java Source code

Introduction

Here is the source code for org.apache.hadoop.mapreduce.lib.input.RHadoopInputFormat.java. RHadoopInputFormat is a FileInputFormat<LongWritable, Text> that builds its input splits from a precomputed index file at /RHadoop/indexLog instead of from the job's configured input paths, and reads records with the standard LineRecordReader.

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with this
 * work for additional information regarding copyright ownership. The ASF
 * licenses this file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package org.apache.hadoop.mapreduce.lib.input;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import org.apache.commons.compress.utils.Charsets;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;

import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.SplittableCompressionCodec;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

/**
 * An {@link InputFormat} for plain text files. Files are broken into lines;
 * either line feed or carriage return is used to signal end of line. Keys are
 * the position in the file, and values are the line of text. Unlike
 * {@link TextInputFormat}, splits are read from a precomputed index file
 * rather than derived from block boundaries.
 */
@InterfaceAudience.Public
@InterfaceStability.Stable
public class RHadoopInputFormat extends FileInputFormat<LongWritable, Text> {
    /**
     * Read the index file and turn each entry into a {@link FileSplit}.
     * 
     * @param job
     *            the job context
     * @return the list of input splits described by the index file
     * @throws IOException
     */
    @Override
    public List<InputSplit> getSplits(JobContext job) throws IOException {
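        // The split index is read from a fixed HDFS path; the job's configured input paths are not consulted.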

        Path path = new Path("/RHadoop/indexLog");
        FileSystem fs = path.getFileSystem(job.getConfiguration());

        HashSet<InputSplit> splits = chunkIndex(fs, path);

        return new ArrayList<InputSplit>(splits);
    }

    // Copied from TextInputFormat: records are plain lines, optionally separated
    // by a custom delimiter configured via textinputformat.record.delimiter.
    @Override
    public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {

        String delimiter = context.getConfiguration().get("textinputformat.record.delimiter");
        byte[] recordDelimiterBytes = null;
        // If a custom delimiter is configured, pass its bytes to LineRecordReader;
        // otherwise LineRecordReader falls back to its default line endings.
        if (null != delimiter)
            recordDelimiterBytes = delimiter.getBytes(Charsets.UTF_8);
        return new LineRecordReader(recordDelimiterBytes);

    }

    public boolean isValidKeyRange(String lower, String upper) {
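        // Placeholder: accept every key range, so chunkIndex turns every index entry into a split.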
        return true;
    }

    public boolean isValidKey(String key) {
        return isValidKeyRange(key, key);
    }

    @SuppressWarnings("deprecation")
    public HashSet<InputSplit> chunkIndex(FileSystem fs, Path filenamePath) {

        HashSet<InputSplit> splitSet = new HashSet<InputSplit>();
        try {
            // Read the index file line by line and build one FileSplit per entry.
            FSDataInputStream in = fs.open(filenamePath);
            String line;
            Path file = null;
            long start = 0;
            long length = 0;
            Log LOG = LogFactory.getLog(FileInputFormat.class);

            line = in.readLine();
            LOG.fatal("RHadoop : opened index file, first line = " + line);
            while (line != null) {
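                // Expected index line format (field names illustrative):
                //   <lowerKey> <upperKey> <filePath> <startOffset> <length> [host1, host2, ...]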
                String[] splits = line.split(" ");
                if (splits.length < 5)
                    throw new Exception("Index line has fewer than 5 fields: " + line);

                String lower = splits[0], upper = splits[1];
                file = new Path(splits[2]);
                start = Long.parseLong(splits[3]);
                length = Long.parseLong(splits[4]);

                splits = line.split("\\[");
                if (splits.length != 2)
                    throw new Exception("Malformed host list (expected exactly one '[') in line: " + line);

                String hostStr = splits[1];
                hostStr = hostStr.substring(0, hostStr.length() - 1);
                String[] hostSplits = hostStr.split(",");
                for (int i = 0; i < hostSplits.length; i++)
                    hostSplits[i] = hostSplits[i].trim();

                if (isValidKeyRange(lower, upper))
                    splitSet.add(new FileSplit(file, start, length, hostSplits));
                line = in.readLine();
            }
            in.close();
        } catch (Exception e) {
            e.printStackTrace();
        }

        return splitSet;
    }

}
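
Usage

Below is a minimal, hypothetical driver sketch showing how a job might plug in RHadoopInputFormat. The driver class name, job name, output path, and the use of the identity Mapper are assumptions for illustration only; they are not part of the original source. Note that getSplits() above reads the index file at the fixed path /RHadoop/indexLog and ignores the job's input paths, so no FileInputFormat.addInputPath call is needed.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.RHadoopInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

// Hypothetical driver class; not part of the original source.
public class RHadoopDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "rhadoop-index-scan");
        job.setJarByClass(RHadoopDriver.class);

        // Splits come from the index file at /RHadoop/indexLog,
        // so no input path is registered on the job.
        job.setInputFormatClass(RHadoopInputFormat.class);

        // Identity mapper: passes each (offset, line) pair through unchanged.
        job.setMapperClass(Mapper.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        job.setOutputFormatClass(TextOutputFormat.class);
        FileOutputFormat.setOutputPath(job, new Path("/RHadoop/output")); // illustrative output path

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}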