edu.isi.mavuno.input.XMLInputFormat.java Source code

Introduction

Here is the source code for edu.isi.mavuno.input.XMLInputFormat.java
Source

/*
 * Cloud9: A MapReduce Library for Hadoop
 * 
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0 
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package edu.isi.mavuno.input;

import java.io.DataInputStream;
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

// solution for reading XML files, posted to the Hadoop users mailing list
// Re: map/reduce function on xml string - Colin Evans-2 Mar 04, 2008; 02:27pm
public class XMLInputFormat extends TextInputFormat {
    public static final String START_TAG_KEY = "xmlinput.start";
    public static final String END_TAG_KEY = "xmlinput.end";

    @Override
    public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context) {
        Configuration conf = context.getConfiguration();
        try {
            return new XMLRecordReader((FileSplit) split, conf);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    public static class XMLRecordReader extends RecordReader<LongWritable, Text> {
        private final LongWritable mCurKey = new LongWritable();
        private final Text mCurValue = new Text();

        private byte[] startTag;
        private byte[] endTag;
        private long start;
        private long end;
        private long pos;
        private DataInputStream fsin = null;
        private final DataOutputBuffer buffer = new DataOutputBuffer();

        private long recordStartPos;

        public XMLRecordReader(FileSplit split, Configuration conf) throws IOException {
            if (conf.get(START_TAG_KEY) == null || conf.get(END_TAG_KEY) == null)
                throw new RuntimeException("Error! XML start and end tags unspecified!");

            startTag = conf.get(START_TAG_KEY).getBytes("utf-8");
            endTag = conf.get(END_TAG_KEY).getBytes("utf-8");

            start = split.getStart();
            Path file = split.getPath();

            CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(conf);
            CompressionCodec codec = compressionCodecs.getCodec(file);

            FileSystem fs = file.getFileSystem(conf);

            if (codec != null) {
                fsin = new DataInputStream(codec.createInputStream(fs.open(file)));
                end = Long.MAX_VALUE;
            } else {
                FSDataInputStream fileIn = fs.open(file);

                fileIn.seek(start);
                fsin = fileIn;

                end = start + split.getLength();
            }

            recordStartPos = start;

            // Because input streams of gzipped files are not seekable
            // (specifically, do not support getPos), we need to keep
            // track of bytes consumed ourselves.
            pos = start;
        }

        @Override
        public void close() throws IOException {
            fsin.close();
        }

        @Override
        public float getProgress() throws IOException {
            return ((float) (pos - start)) / ((float) (end - start));
        }

        private boolean readUntilMatch(byte[] match, boolean withinBlock) throws IOException {
            int i = 0;
            while (true) {
                int b = fsin.read();
                // increment position (bytes consumed)
                pos++;

                // end of file:
                if (b == -1)
                    return false;
                // save to buffer:
                if (withinBlock)
                    buffer.write(b);

                // check if we're matching:
                if (b == match[i]) {
                    i++;
                    if (i >= match.length)
                        return true;
                } else
                    i = 0;
                // see if we've passed the stop point:
                if (!withinBlock && i == 0 && pos >= end)
                    return false;
            }
        }

        @Override
        public LongWritable getCurrentKey() throws IOException, InterruptedException {
            return mCurKey;
        }

        @Override
        public Text getCurrentValue() throws IOException, InterruptedException {
            return mCurValue;
        }

        @Override
        public void initialize(InputSplit split, TaskAttemptContext context)
                throws IOException, InterruptedException {
        }

        @Override
        public boolean nextKeyValue() throws IOException, InterruptedException {
            if (pos < end) {
                if (readUntilMatch(startTag, false)) {
                    recordStartPos = pos - startTag.length;

                    try {
                        buffer.write(startTag);
                        if (readUntilMatch(endTag, true)) {
                            mCurKey.set(recordStartPos);
                            mCurValue.set(buffer.getData(), 0, buffer.getLength());
                            return true;
                        }
                    } finally {
                        buffer.reset();
                    }
                }
            }
            return false;
        }
    }
}