gov.llnl.ontology.text.hbase.XMLRecordReader.java Source code

Introduction

Here is the source code for gov.llnl.ontology.text.hbase.XMLRecordReader.java
Source

/*
 * Copyright (c) 2011, Lawrence Livermore National Security, LLC. Produced at
 * the Lawrence Livermore National Laboratory. Written by Keith Stevens,
 * kstevens@cs.ucla.edu OCEC-10-073 All rights reserved. 
 *
 * This file is part of the C-Cat package and is covered under the terms and
 * conditions therein.
 *
 * The C-Cat package is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation and distributed hereunder to you.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
 * EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
 * NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
 * PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
 * WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
 * RIGHTS.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

package gov.llnl.ontology.text.hbase;

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import org.apache.hadoop.hbase.io.ImmutableBytesWritable;

import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import java.util.zip.GZIPInputStream;

/**
 * A {@link RecordReader} that carves an XML file (optionally gzipped) into
 * records.  Each record is the raw text found between an opening delimiter tag
 * and its matching closing tag, both inclusive.  The delimiter tag name is
 * read from the {@link #DELIMITER_TAG} configuration entry; when that entry is
 * missing or empty, the file name with {@code .xml} removed is used instead.
 *
 * <p>The key of each record is the decimal string of the stream position just
 * after the record's closing tag; the value is the record text.
 *
 * <p>NOTE(review): the underlying stream is always read from the beginning of
 * the file; {@code split.getStart()} is only used for bookkeeping.  This
 * presumably relies on the input format handing out one unsplit
 * {@link FileSplit} per file -- confirm against the input format in use.
 *
 * @author Keith Stevens
 */
public class XMLRecordReader extends RecordReader<ImmutableBytesWritable, Text> {

    public static final String CONF_PREFIX = "gov.llnl.ontology.text.hbase.XMLRecordReader";

    /**
     * Configuration key naming the XML element that delimits one record.
     */
    public static final String DELIMITER_TAG = CONF_PREFIX + ".tag";

    /**
     * Charset used when turning tag names and keys into bytes, so that the
     * result does not depend on the platform default encoding.
     */
    private static final String BYTE_CHARSET = "UTF-8";

    /**
     * The current {@link ImmutableBytesWritable} key read.
     */
    private ImmutableBytesWritable currentKey;

    /**
     * The current {@link Text} document.
     */
    private Text currentDocument;

    /**
     * The bytes of {@code "<" + tagName}.  The character terminating the tag
     * name is checked separately so that attributes are allowed.
     */
    private byte[] startTag;

    /**
     * The bytes of {@code "</" + tagName + ">"}.
     */
    private byte[] endTag;

    /**
     * A file stream for reading xml data that needs to be partitioned.
     */
    private InputStream fsin;

    /**
     * The start byte position reported by the split.
     */
    private long start;

    /**
     * The end byte position reported by the split ({@code start + length}).
     */
    private long end;

    /**
     * The number of bytes consumed from {@link #fsin} so far.  For gzipped
     * input this counts decompressed bytes.
     */
    private long pos;

    /**
     * An output buffer for storing the bytes that will compose a single
     * record.
     */
    private final DataOutputBuffer buffer = new DataOutputBuffer();

    /**
     * Set to true if the xml files are gzipped.
     */
    private final boolean useGzip;

    /**
     * Creates a new {@link XMLRecordReader} without gzipped files.
     */
    public XMLRecordReader() {
        this(false);
    }

    /**
     * Creates a new {@link XMLRecordReader} with {@code useGzip} set to true if
     * the files are in a gzip format.
     */
    public XMLRecordReader(boolean useGzip) {
        this.useGzip = useGzip;
    }

    /**
     * Opens the file named by the split and works out the record delimiter
     * tags.
     *
     * @param isplit the split to read; must be a {@link FileSplit}
     * @param context supplies the job {@link Configuration}
     * @throws IOException if the file cannot be opened or is not valid gzip
     *         data when gzip was requested
     */
    public void initialize(InputSplit isplit, TaskAttemptContext context) throws IOException, InterruptedException {
        Configuration config = context.getConfiguration();

        // Get the file stream for the xml file.
        FileSplit split = (FileSplit) isplit;
        Path file = split.getPath();
        FileSystem fs = file.getFileSystem(config);
        InputStream raw = fs.open(file);
        if (useGzip) {
            try {
                raw = new GZIPInputStream(raw);
            } catch (IOException ioe) {
                // The gzip header is read eagerly; do not leak the file
                // handle when it turns out to be invalid.
                raw.close();
                throw ioe;
            }
        }
        fsin = new BufferedInputStream(raw);

        // Setup the limits of the xml file.
        start = split.getStart();
        end = start + split.getLength();
        pos = 0;

        // Get the xml document delimiters for this xml file.  An unset
        // configuration entry is treated the same as an empty one, in which
        // case the tag name is derived from the file name.
        String tagName = config.get(DELIMITER_TAG, "");
        if ("".equals(tagName)) {
            tagName = file.getName().replace(".xml", "");
        }
        startTag = ("<" + tagName).getBytes(BYTE_CHARSET);
        endTag = ("</" + tagName + ">").getBytes(BYTE_CHARSET);

        context.setStatus(file.getName() + " " + pos + " " + end);
    }

    /**
     * Advances the reader to the next record delimited by the start and end
     * tags.
     *
     * @return {@code true} if a complete record was read and is available
     *         through {@link #getCurrentKey} and {@link #getCurrentValue};
     *         {@code false} when no further complete record exists
     * @throws IOException if reading the underlying stream fails
     */
    public boolean nextKeyValue() throws IOException {
        currentKey = new ImmutableBytesWritable();
        currentDocument = new Text();

        // Loop (rather than recurse) over candidate start tags so that a
        // file with many longer tags sharing the prefix cannot exhaust the
        // stack.
        while (pos < end) {
            buffer.reset();
            if (!readUntilMatch(startTag, false)) {
                return false;
            }

            // Sometimes our start tag is a prefix of another, longer tag
            // name.  To ensure that we have the correct tag, check that the
            // next byte terminates the tag name.  If it does not, look for
            // the next candidate.
            int b = fsin.read();
            if (b == -1) {
                return false;
            }
            pos++;
            if (!endsTagName(b)) {
                continue;
            }

            // Write the start tag.
            buffer.write(startTag);
            buffer.write(b);

            // Read the record into the buffer.
            if (!readUntilMatch(endTag, true)) {
                return false;
            }

            // Write the key and value for this record.
            currentKey.set(Long.toString(pos).getBytes(BYTE_CHARSET));
            currentDocument.set(buffer.getData(), 0, buffer.getLength());
            return true;
        }
        return false;
    }

    /**
     * {@inheritDoc}
     */
    public ImmutableBytesWritable getCurrentKey() {
        return currentKey;
    }

    /**
     * {@inheritDoc}
     */
    public Text getCurrentValue() {
        return currentDocument;
    }

    /**
     * Reports the fraction of the split consumed so far, clamped to
     * {@code [0, 1]}.  An empty split reports {@code 0}.  For gzipped input
     * the position counts decompressed bytes while the split length is the
     * compressed size, so the value is only an estimate.
     */
    public float getProgress() throws IOException, InterruptedException {
        long length = end - start;
        if (length <= 0) {
            return 0.0f;
        }
        float fraction = (pos - start) / (float) length;
        return Math.max(0.0f, Math.min(1.0f, fraction));
    }

    /**
     * Closes the underlying stream, if one was opened.
     */
    public void close() throws IOException {
        if (fsin != null) {
            fsin.close();
        }
    }

    /**
     * Returns true if {@code b} can legally follow the element name in a
     * start tag without extending the name: XML white space (space, tab,
     * carriage return, line feed) or the closing {@code '>'}.
     */
    private static boolean endsTagName(int b) {
        return b == ' ' || b == '>' || b == '\t' || b == '\n' || b == '\r';
    }

    /**
     * Reads bytes from the file stream until the most recent bytes equal
     * {@code match}.  Returns true if a match was found and false if the end
     * of the stream was reached first, or, when {@code withinBlock} is false,
     * if the split end was passed while no partial match was in progress.  If
     * {@code withinBlock} is true, every byte read is appended to
     * {@code buffer}.
     *
     * @param match the non-empty byte sequence to look for
     * @param withinBlock whether the bytes read belong to a record
     */
    private boolean readUntilMatch(byte[] match, boolean withinBlock) throws IOException {
        int matched = 0;
        while (true) {
            // Read the next byte and check for end of file before counting
            // it as consumed.
            int b = fsin.read();
            if (b == -1) {
                return false;
            }
            pos++;

            // Save to the buffer.
            if (withinBlock) {
                buffer.write(b);
            }

            // Check if we're matching.  read() yields 0..255 while the
            // pattern holds signed bytes, so compare as bytes.
            byte current = (byte) b;
            if (current == match[matched]) {
                matched++;
                if (matched >= match.length) {
                    return true;
                }
            } else {
                // The byte that broke a partial match may itself begin a
                // new one (e.g. the second '<' in "<<tag").
                matched = (current == match[0]) ? 1 : 0;
            }

            // Stop searching for a new record once the split end has been
            // passed and no partial match is in progress.
            if (!withinBlock && matched == 0 && pos >= end) {
                return false;
            }
        }
    }
}