it.crs4.seal.tsv_sort.TsvInputFormat.java Source code

Java tutorial

Introduction

Here is the source code for it.crs4.seal.tsv_sort.TsvInputFormat.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// based on TeraSort from the Hadoop examples

package it.crs4.seal.tsv_sort;

import it.crs4.seal.common.CutText;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.*;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.util.IndexedSortable;
import org.apache.hadoop.util.QuickSort;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * An input format that reads the configured fields as the key and the whole
 * line as the value.  Both key and value are represented as Text.
 */
public class TsvInputFormat extends FileInputFormat<Text, Text> implements Configurable {

    private static final Log LOG = LogFactory.getLog(TsvInputFormat.class);

    /**
     * Configuration property holding the key column selector: a comma-separated
     * list of 1-based column numbers and/or inclusive ranges (e.g. {@code 1,3-5,12}).
     * An empty value selects the entire line as the key.
     */
    public static final String COLUMN_KEYS_CONF = "seal.tsv-input.key-columns";

    /** Configuration property holding the field delimiter handed to {@link CutText}. */
    public static final String DELIM_CONF = "seal.tsv-input.delim";

    /** Delimiter used when {@link #DELIM_CONF} is not set. */
    public static final String DELIM_DEFALT = "\t";

    /**
     * Matches one element of the key column selector. Groups 1 and 2 are the
     * start and end of a range; group 3 is a single column number. The
     * quantifiers are {@code \d+} so that columns 10 and above are accepted.
     */
    protected static final Pattern RangeSelectorPatter = Pattern.compile("(\\d+)-(\\d+)|(\\d+)");

    // Single-entry cache used by getSplits(): the last context seen and the splits computed for it.
    protected static JobContext lastContext = null;
    protected static List<InputSplit> lastResult = null;

    /** Zero-based indices of the key columns; empty when the whole line is the key. */
    protected int[] keyFields = null;
    protected Configuration conf;
    /** The selector string that {@link #keyFields} was computed from. */
    protected String cachedKeyFieldSelector;

    /**
     * Scan the config parameter COLUMN_KEYS_CONF and set keyFields.
     *
     * <p>The selector is parsed completely and validated before any instance
     * state is modified, so a rejected selector leaves the previous
     * {@code keyFields} and cached selector untouched.
     *
     * @param conf configuration to read the selector from
     * @throws IllegalArgumentException if the selector is syntactically invalid,
     *         contains a range whose start exceeds its end, contains a number
     *         that does not fit in an {@code int}, or names a column below 1
     */
    private void setupKeyFields(Configuration conf) {
        String keyFieldSelector = conf.get(COLUMN_KEYS_CONF, "");
        if (keyFieldSelector.equals(cachedKeyFieldSelector)) {
            return; // no need to redo the work
        }

        List<Integer> fields = new ArrayList<Integer>();

        if (keyFieldSelector.isEmpty()) {
            LOG.info("key column(s) property not specified (" + COLUMN_KEYS_CONF
                    + ").  Using entire line as the key.");
        } else {
            for (String group : keyFieldSelector.split(",")) {
                Matcher m = RangeSelectorPatter.matcher(group);
                if (!m.matches()) {
                    throw new IllegalArgumentException(
                            "Invalid key column specification syntax " + keyFieldSelector);
                }

                if (m.group(1) == null) {
                    // specified a simple column number
                    fields.add(parseColumnNumber(m.group(3), keyFieldSelector));
                } else {
                    int start = parseColumnNumber(m.group(1), keyFieldSelector);
                    int end = parseColumnNumber(m.group(2), keyFieldSelector);
                    if (start > end) {
                        throw new IllegalArgumentException(
                                "key field specification contains a range with start > end: "
                                        + keyFieldSelector);
                    }
                    for (int i = start; i <= end; ++i) {
                        fields.add(i);
                    }
                }
            }
        }

        // Convert the user-facing 1-based column numbers to 0-based indices.
        // Work on a local array so a validation failure cannot leave a
        // half-initialised keyFields behind.
        int[] parsed = new int[fields.size()];
        for (int i = 0; i < parsed.length; ++i) {
            int field = fields.get(i);
            if (field <= 0) {
                throw new IllegalArgumentException(
                        "Field numbers must be greater than or equal to 1 (found " + field + ")");
            }
            parsed[i] = field - 1;
        }
        keyFields = parsed;

        // cache the processed keyFieldSelector value
        cachedKeyFieldSelector = keyFieldSelector;
    }

    /**
     * Parses a run of decimal digits taken from the key column selector.
     *
     * @param digits   the digits matched by {@link #RangeSelectorPatter}
     * @param selector the complete selector, used only for the error message
     * @return the parsed column number
     * @throws IllegalArgumentException if the number does not fit in an {@code int}
     */
    private static int parseColumnNumber(String digits, String selector) {
        try {
            return Integer.parseInt(digits);
        } catch (NumberFormatException e) {
            throw new IllegalArgumentException(
                    "Invalid key column specification syntax " + selector, e);
        }
    }

    /**
     * Stores the configuration and (re)computes the key columns from it.
     *
     * @throws IllegalArgumentException if the key column selector is invalid
     */
    @Override
    public void setConf(Configuration conf) {
        this.conf = conf;
        setupKeyFields(conf);
    }

    @Override
    public Configuration getConf() {
        return conf;
    }

    /**
     * Creates a reader that emits the configured key columns as the key and the
     * whole line as the value. The task's configuration replaces any
     * configuration previously set on this instance.
     */
    @Override
    public RecordReader<Text, Text> createRecordReader(InputSplit split, TaskAttemptContext context)
            throws IOException {
        setConf(context.getConfiguration());
        return new TsvRecordReader(getConf(), keyFields);
    }

    /**
     * Implements caching getSplits.
     *
     * <p>The result for the most recent {@link JobContext} is remembered in
     * static fields and returned again (the same list instance) when the method
     * is called with that same context object. The comparison is by reference.
     */
    @Override
    public List<InputSplit> getSplits(JobContext context) throws IOException {
        if (context != lastContext) {
            lastContext = context;
            lastResult = super.getSplits(context);
        }
        return lastResult;
    }

    /**
     * Record reader that wraps a {@link LineRecordReader}: the value is the line
     * as read, the key is either the whole line or the concatenation of the
     * selected key fields.
     */
    static class TsvRecordReader extends RecordReader<Text, Text> {

        private final LineRecordReader in;
        private final Text key = new Text();
        private Text line = new Text();
        /** Field extractor; {@code null} when the whole line is used as the key. */
        private final CutText cutter;
        /** Scratch buffer for assembling the key; {@code null} when {@link #cutter} is. */
        private final StringBuilder builder;

        /**
         * @param conf      configuration from which the delimiter ({@link #DELIM_CONF}) is read
         * @param keyFields zero-based key column indices; an empty array means the
         *                  whole line is the key
         */
        public TsvRecordReader(Configuration conf, int[] keyFields) throws IOException {
            in = new LineRecordReader();
            if (keyFields.length == 0) {
                cutter = null;
                builder = null;
            } else {
                cutter = new CutText(conf.get(DELIM_CONF, DELIM_DEFALT), keyFields);
                builder = new StringBuilder(1000);
            }
        }

        @Override
        public void initialize(InputSplit split, TaskAttemptContext context)
                throws IOException, InterruptedException {
            in.initialize(split, context);
        }

        @Override
        public void close() throws IOException {
            in.close();
        }

        @Override
        public float getProgress() throws IOException, InterruptedException {
            return in.getProgress();
        }

        @Override
        public Text getCurrentKey() throws IOException, InterruptedException {
            return key;
        }

        @Override
        public Text getCurrentValue() throws IOException, InterruptedException {
            return line;
        }

        /**
         * Advances to the next line and recomputes the key for it.
         *
         * @return {@code true} if a record was read, {@code false} at end of input
         * @throws RuntimeException if {@link CutText} reports a format problem with
         *         the line; the original exception is attached as the cause
         */
        @Override
        public boolean nextKeyValue() throws IOException, InterruptedException {
            if (!in.nextKeyValue()) {
                return false;
            }

            line = in.getCurrentValue();
            if (cutter == null) {
                // whole line is the key
                key.set(line);
                return true;
            }

            try {
                builder.setLength(0);

                cutter.loadRecord(line);
                int nFields = cutter.getNumFields();
                // NOTE(review): fields are concatenated with no separator, so
                // ("ab", "c") and ("a", "bc") produce the same key -- confirm
                // this is intended before relying on key uniqueness.
                for (int i = 0; i < nFields; ++i) {
                    builder.append(cutter.getField(i));
                }

                key.set(builder.toString());
                return true;
            } catch (CutText.FormatException e) {
                // keep the cause so the underlying parse failure is not lost
                throw new RuntimeException("format problem with line: " + line, e);
            }
        }
    }
}