edu.stanford.pigir.warc.PigWarcRecord.java Source code

Java tutorial

Introduction

Here is the source code for edu.stanford.pigir.warc.PigWarcRecord.java

Source

package edu.stanford.pigir.warc;

/**
 * Container for a generic Warc Record 
 * 
 * (C) 2009 - Carnegie Mellon University
 * 
 * 1. Redistributions of this source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer. 
 * 2. The names "Lemur", "Indri", "University of Massachusetts",  
 *    "Carnegie Mellon", and "lemurproject" must not be used to 
 *    endorse or promote products derived from this software without
 *    prior written permission. To obtain permission, contact 
 *    license@lemurproject.org.
 *
 * 4. Products derived from this software may not be called "Lemur" or "Indri"
 *    nor may "Lemur" or "Indri" appear in their names without prior written
 *    permission of The Lemur Project. To obtain permission,
 *    contact license@lemurproject.org.
 * 
 * THIS SOFTWARE IS PROVIDED BY THE LEMUR PROJECT AS PART OF THE CLUEWEB09
 * PROJECT AND OTHER CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED 
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN 
 * NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY 
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING 
 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
 * POSSIBILITY OF SUCH DAMAGE. 
 * 
 * @author mhoy@cs.cmu.edu (Mark J. Hoy)
 * 
 * Jan 17, 2011; Andreas Paepcke: added inheritance from Text
 * Jan 19, 2011; Andreas Paepcke: modified to fit in Hadoop/Pig workflow. 
 *                                Replaced separate header API with a 
 *                                Map<String,String> implementation that
 *                                includes 'content' as one of its fields.
 * Jan 15, 2013; Andreas Paepcke: added ability to accept multiple WARC versions. 
 *                           Search for WARC_VERSIONS, and add new ones as
 *                           appropriate.  
 */

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.lang.reflect.Constructor;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

import org.apache.hadoop.io.Text;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.Tuple;

/**
 * A WARC record exposed as a map from header field name to header value.
 *
 * Header field names are stored lower-cased. Two pseudo keys are handled
 * outside of the header map: {@link #CONTENT} (the record payload, kept as a
 * byte array) and {@link #WARC_VERSION} (the version string of the record).
 */
public class PigWarcRecord extends Text implements WarcRecordMap {

    // Class variables:

    public static final String CONTENT = "content";
    public static final String WARC_VERSION = "warc-version";

    // All lower-case WARC header field names. Keys are lower-cased before
    // they are stored (see put()), so these constants must be lower-case
    // for lookups against the header map to succeed.
    public static final String WARC_TYPE = "warc-type";
    public static final String WARC_RECORD_ID = "warc-record-id";
    public static final String WARC_DATE = "warc-date";
    public static final String CONTENT_LENGTH = "content-length";
    public static final String CONTENT_TYPE = "content-type";
    public static final String WARC_CONCURRENT_TO = "warc-concurrent-to";
    public static final String WARC_BLOCK_DIGEST = "warc-block-digest";
    public static final String WARC_PAYLOAD_DIGEST = "warc-payload-digest";
    public static final String WARC_IP_ADDRESS = "warc-ip-address";
    public static final String WARC_REFERS_TO = "warc-refers-to";
    public static final String WARC_TARGET_URI = "warc-target-uri";
    public static final String WARC_TRUNCATED = "warc-truncated";
    public static final String WARC_WARCINFO_ID = "warc-warcinfo-id";
    public static final String WARC_FILENAME = "warc-filename";
    public static final String WARC_PROFILE = "warc-profile";
    public static final String WARC_IDENTIFIED_PAYLOAD_TYPE = "warc-identified-payload-type";
    public static final String WARC_SEGMENT_ORIGIN_ID = "warc-segment-origin-id";
    public static final String WARC_SEGMENT_NUMBER = "warc-segment-number";
    public static final String WARC_SEGMENT_TOTAL_LENGTH = "warc-segment-total-length";

    // Lookup table for properly capitalized ISO Warc header field
    // names. Used in toString();
    public static final Map<String, String> ISO_WARC_HEADER_FIELD_NAMES = buildIsoHeaderFieldNames();

    public static final String[] mandatoryHeaderFields = { WARC_RECORD_ID, CONTENT_LENGTH, WARC_DATE, WARC_TYPE };

    // Provide a constructor for each of the header datatypes:
    private static Constructor<String> strConstructor = null;
    private static Constructor<Integer> intConstructor = null;

    // The constructors are class-level state, so they are resolved once when
    // the class is loaded rather than on every instance creation.
    static {
        try {
            strConstructor = String.class.getConstructor(String.class);
            intConstructor = Integer.class.getConstructor(String.class);
        } catch (SecurityException e1) {
            e1.printStackTrace();
        } catch (NoSuchMethodException e1) {
            e1.printStackTrace();
        }
    }

    /** Maps each mandatory header field name to the constructor of its value type. */
    @SuppressWarnings("rawtypes")
    public HashMap<String, Constructor> mandatoryWarcHeaderFldTypes = newMandatoryFieldTypes();

    public static final boolean INCLUDE_CONTENT = true;
    public static final boolean DONT_INCLUDE_CONTENT = false;

    // Marker to look for when finding the next WARC record in a stream:
    public static String[] WARC_VERSIONS = { "WARC/0.18", "WARC/1.0" };
    //public static String WARC_VERSION_LINE = "WARC/0.18\n";
    private static final String NEWLINE = "\n";

    // Charset name used for every String <-> byte[] conversion of the content.
    private static final String UTF8 = "UTF-8";

    // Instance variables:

    // Initialized eagerly so that a record made with the no-arg constructor
    // is immediately usable as an (empty) map.
    protected HashMap<String, String> headerMap = new LinkedHashMap<String, String>();
    protected byte[] warcContent = new byte[0];
    protected HashSet<String> optionalHeaderKeysThisRecord = new HashSet<String>();
    protected String warcVersion = null;

    /**
     * Creates an empty record: no header fields, zero-length content and no
     * WARC version.
     */
    public PigWarcRecord() {

    }

    /**
     * Make a WARC record from a Pig tuple.
     *
     * Fields 0-3 are read as record id, content length, date and type. If
     * present, field 4 is read as a bag of (name, value) tuples holding the
     * optional header fields, and field 5 as the content, which must be a
     * {@code DataByteArray}. The content bytes are copied as-is, without any
     * character set conversion.
     *
     * @param warcTuple the tuple to convert
     * @throws IOException if the tuple has fewer than the mandatory fields,
     *         or if any field cannot be extracted or has an unexpected type
     */
    public PigWarcRecord(Tuple warcTuple) throws IOException {
        final int numMandatory = mandatoryWarcHeaderFldTypes.size();
        if (warcTuple.size() < numMandatory) {
            throw new IOException("WARC tuple '" + warcTuple.toString() + "' has fewer than required fields.");
        }
        try {
            this.put(WARC_RECORD_ID, (String) warcTuple.get(0));
            Object contentLength = warcTuple.get(1);
            if (contentLength instanceof Number) {
                // Covers Integer as well as other numeric field types such as Long.
                this.put(CONTENT_LENGTH, contentLength.toString());
            } else if (contentLength instanceof String) {
                this.put(CONTENT_LENGTH, (String) contentLength);
            }
            this.put(WARC_DATE, (String) warcTuple.get(2));
            this.put(WARC_TYPE, (String) warcTuple.get(3));

            // The optional header fields are stored in a bag:
            if (warcTuple.size() > numMandatory) {
                readOptionalHeaders(warcTuple);
            }
            // If tuple also has content field, get it into the new WarcRecord as well:
            if (warcTuple.size() > numMandatory + 1) {
                readContent(warcTuple);
            }
        } catch (IOException e) {
            throw e;
        } catch (Exception e) {
            throw new IOException(
                    "Error while processing WARC tuple '" + warcTuple.toString() + "':" + e.getMessage(), e);
        }
    }

    /** Copies the optional header (name, value) pairs from tuple field 4 into this record. */
    private void readOptionalHeaders(Tuple warcTuple) throws IOException {
        DataBag optionalHeaderFieldBag = (DataBag) warcTuple.get(4);
        if (optionalHeaderFieldBag == null) {
            return;
        }
        Iterator<Tuple> optHeaderIt = optionalHeaderFieldBag.iterator();
        while (optHeaderIt.hasNext()) {
            Tuple headFldNameVal = optHeaderIt.next();
            String headFldName;
            String headFldVal;
            try {
                headFldName = (String) headFldNameVal.get(0);
                headFldVal = (String) headFldNameVal.get(1);
            } catch (Exception e) {
                throw new IOException("Error extracting optional WARC header fields from WARC tuple '"
                        + warcTuple.toString() + "':" + e.getMessage(), e);
            }
            this.put(headFldName, headFldVal);
            // Remember which keys of this record came from the optional header bag.
            optionalHeaderKeysThisRecord.add(normalizeKey(headFldName));
        }
    }

    /** Copies the raw content bytes from tuple field 5 into this record. */
    private void readContent(Tuple warcTuple) throws IOException {
        try {
            Object rawContent = warcTuple.get(5);
            if (!(rawContent instanceof DataByteArray))
                throw new IOException(
                        "WARC file content fields must be declared as 'bytearray' in Pig scripts. Otherwise binary content, like images get destroyed.");
            // Take the bytes directly: a detour through String would re-encode
            // them and corrupt binary payloads.
            byte[] rawBytes = ((DataByteArray) rawContent).get();
            warcContent = (rawBytes == null) ? new byte[0] : rawBytes.clone();
        } catch (Exception e) {
            throw new IOException("Error during reading of content field from WARC tuple '"
                    + warcTuple.toString() + "':" + e.getMessage(), e);
        }
    }

    /**
     * Retrieves the bytes content as a UTF-8 string
     * @return the content decoded as UTF-8
     */
    public String getContentUTF8() {
        String retString = null;
        try {
            retString = new String(warcContent, UTF8);
        } catch (UnsupportedEncodingException ex) {
            retString = new String(warcContent);
        }
        return retString;
    }

    /**
     * Returns the header fields, one per line, with the content suppressed.
     */
    @Override
    public String toString() {
        return toString(DONT_INCLUDE_CONTENT);
    }

    /**
     * Renders the header fields as {@code Name:value} lines, using the ISO
     * capitalization for known field names and the stored name otherwise.
     *
     * @param shouldIncludeContent if true, a blank line and the UTF-8 decoded
     *        content follow the headers; otherwise a placeholder line is added
     * @return the rendered record
     */
    public String toString(boolean shouldIncludeContent) {
        StringBuilder retBuffer = new StringBuilder();
        for (Map.Entry<String, String> header : headerMap.entrySet()) {
            String officialName = ISO_WARC_HEADER_FIELD_NAMES.get(header.getKey());
            if (officialName == null) {
                // Non-official WARC header name: just use it directly:
                officialName = header.getKey();
            }
            String headerVal = header.getValue();
            retBuffer.append(officialName).append(":").append(headerVal == null ? "" : headerVal).append("\n");
        }
        if (shouldIncludeContent) {
            retBuffer.append(NEWLINE);
            retBuffer.append(getContentUTF8());
        } else {
            retBuffer.append(
                    "[Record content suppressed. Use toString(INCLUDE_CONTENT) to see the content string.\n");
        }
        return retBuffer.toString();
    }

    /**
     * Returns content as byte array.
     * @return the internal content array (not a copy)
     */
    public byte[] getContentRaw() {
        return warcContent;
    }

    //  -----------------------------------  MAP<String,String> Methods -----------------------

    /**
     * @return the number of header fields plus one for the pseudo 'content'
     *         entry. The 'warc-version' pseudo entry is not counted here,
     *         although keySet() and entrySet() do include it.
     */
    public int size() {
        // Plus 1 is for the pseudo 'content' byte array
        // that's not really part of the hash:
        return headerMap.size() + 1;
    }

    /**
     * @return true if there are no header fields and the content is zero-length
     */
    public boolean isEmpty() {
        return headerMap.isEmpty() && (warcContent.length == 0);
    }

    /**
     * @param key field name, compared case-insensitively
     * @return true for the pseudo keys 'content' and 'warc-version' and for
     *         any stored header field; false for non-String keys
     */
    public boolean containsKey(Object key) {
        if (!(key instanceof String)) {
            return false;
        }
        String lowerCaseKey = normalizeKey((String) key);
        return (headerMap.containsKey(lowerCaseKey) || lowerCaseKey.equals(CONTENT)
                || lowerCaseKey.equals(WARC_VERSION));
    }

    /**
     * @param value the value to look for
     * @return true if some header field has exactly this value, or if the
     *         value is a String that occurs within the UTF-8 decoded content
     */
    public boolean containsValue(Object value) {
        if (headerMap.containsValue(value))
            return true;
        if (!(value instanceof String)) {
            return false;
        }
        return getContentUTF8().contains((String) value);
    }

    /**
     * @param key field name, compared case-insensitively
     * @return the UTF-8 decoded content for 'content', the version string for
     *         'warc-version', otherwise the stored header value; null if the
     *         key is absent or not a String
     */
    public String get(Object key) {
        if (!(key instanceof String)) {
            return null;
        }
        String keyStr = (String) key;
        if (keyStr.equalsIgnoreCase(CONTENT)) {
            return getContentUTF8();
        }
        if (keyStr.equalsIgnoreCase(WARC_VERSION)) {
            return warcVersion;
        }
        return headerMap.get(normalizeKey(keyStr));
    }

    /**
     * Stores a value under the lower-cased key. The pseudo keys 'content' and
     * 'warc-version' update the content bytes (encoded as UTF-8) and the
     * version string, respectively, instead of the header map.
     *
     * @param key field name; must not be null
     * @param value the new value; a null content value yields empty content
     * @return the value previously associated with the key
     */
    public String put(String key, String value) {
        String prevValue;
        String lowerCaseKey = normalizeKey(key);
        if (lowerCaseKey.equals(CONTENT)) {
            prevValue = getContentUTF8();
            warcContent = toUtf8Bytes(value);
            return prevValue;
        }
        if (lowerCaseKey.equals(WARC_VERSION)) {
            prevValue = warcVersion;
            warcVersion = value;
            return prevValue;
        }
        return headerMap.put(lowerCaseKey, value);
    }

    /**
     * Removes a field. Removing 'content' resets the content to zero length;
     * removing 'warc-version' resets the version to null.
     *
     * @param key field name, compared case-insensitively
     * @return the value previously associated with the key; null if the key
     *         is absent or not a String
     */
    public String remove(Object key) {
        if (!(key instanceof String)) {
            return null;
        }
        String prevValue;
        String lowerCaseKey = normalizeKey((String) key);
        if (lowerCaseKey.equals(CONTENT)) {
            prevValue = getContentUTF8();
            warcContent = new byte[0];
            return prevValue;
        }
        if (lowerCaseKey.equals(WARC_VERSION)) {
            prevValue = warcVersion;
            warcVersion = null;
            return prevValue;
        }
        return headerMap.remove(lowerCaseKey);
    }

    /**
     * Calls {@link #put(String, String)} for every entry of the given map.
     */
    public void putAll(Map<? extends String, ? extends String> m) {
        for (Map.Entry<? extends String, ? extends String> entry : m.entrySet()) {
            put(entry.getKey(), entry.getValue());
        }
    }

    /**
     * @return a new set holding the header keys plus the pseudo keys
     *         'content' and 'warc-version'
     */
    public Set<String> keySet() {
        // Copy first: the key view of a HashMap does not support add().
        Set<String> res = new LinkedHashSet<String>(headerMap.keySet());
        res.add(CONTENT);
        res.add(WARC_VERSION);
        return res;
    }

    /**
     * @return the key view of the header map (no pseudo keys)
     */
    public Set<String> keySetHeader() {
        return headerMap.keySet();
    }

    /**
     * @return the names of the mandatory header fields
     */
    public String[] mandatoryKeysHeader() {
        return mandatoryHeaderFields;
    }

    /**
     * @return the set of optional header keys recorded for this record
     */
    public Set<String> optionalKeysHeader() {
        return optionalHeaderKeysThisRecord;
    }

    /**
     * @return the values of the mandatory header fields, in the order of
     *         {@link #mandatoryHeaderFields}; absent fields yield null
     */
    public String[] mandatoryValuesHeader() {
        String[] res = new String[mandatoryHeaderFields.length];
        for (int i = 0; i < mandatoryHeaderFields.length; i++) {
            res[i] = get(mandatoryHeaderFields[i]);
        }
        return res;
    }

    /**
     * @return a new collection holding the header values followed by the
     *         UTF-8 decoded content
     */
    public Collection<String> values() {
        // Copy first: the value view of a HashMap does not support add().
        Collection<String> res = new ArrayList<String>(headerMap.values());
        res.add(getContentUTF8());
        return res;
    }

    /**
     * @return the value view of the header map (no content)
     */
    public Collection<String> valuesHeader() {
        return headerMap.values();
    }

    /**
     * @return all entries including the content; same as {@code entrySet(true)}
     */
    @SuppressWarnings({ "unchecked", "rawtypes" })
    public Set entrySet() {
        return entrySet(true);
    }

    /**
     * Builds a new set with one entry per header field plus the
     * 'warc-version' pseudo entry.
     *
     * @param readContent if true, the set also holds a 'content' entry with
     *        the UTF-8 decoded content
     * @return a snapshot of the entries; changes to it do not affect this record
     */
    public Set<Entry<String, String>> entrySet(boolean readContent) {
        Set<Entry<String, String>> res = new LinkedHashSet<Entry<String, String>>();
        for (Map.Entry<String, String> headerMapEntry : headerMap.entrySet()) {
            res.add(new Entry<String, String>(headerMapEntry.getKey(), headerMapEntry.getValue()));
        }
        if (readContent) {
            res.add(new Entry<String, String>(CONTENT, getContentUTF8()));
        }
        res.add(new Entry<String, String>(WARC_VERSION, warcVersion));
        return res;
    }

    //  -----------------------------------  Private helpers -----------------------

    /** Lower-cases a key independently of the JVM's default locale. */
    private static String normalizeKey(String key) {
        return key.toLowerCase(Locale.ENGLISH);
    }

    /** Encodes text as UTF-8, the charset getContentUTF8() decodes with; null yields an empty array. */
    private static byte[] toUtf8Bytes(String text) {
        if (text == null) {
            return new byte[0];
        }
        try {
            return text.getBytes(UTF8);
        } catch (UnsupportedEncodingException ex) {
            return text.getBytes();
        }
    }

    /** Builds the table from lower-case header field names to their ISO capitalization. */
    private static Map<String, String> buildIsoHeaderFieldNames() {
        Map<String, String> names = new HashMap<String, String>();
        names.put(WARC_TYPE, "WARC-Type");
        names.put(WARC_RECORD_ID, "WARC-Record-ID");
        names.put(WARC_DATE, "WARC-Date");
        names.put(CONTENT_LENGTH, "Content-Length");
        names.put(CONTENT_TYPE, "Content-Type");
        names.put(WARC_CONCURRENT_TO, "WARC-Concurrent-To");
        names.put(WARC_BLOCK_DIGEST, "WARC-Block-Digest");
        names.put(WARC_PAYLOAD_DIGEST, "WARC-Payload-Digest");
        names.put(WARC_IP_ADDRESS, "WARC-IP-Address");
        names.put(WARC_REFERS_TO, "WARC-Refers-To");
        names.put(WARC_TARGET_URI, "WARC-Target-URI");
        names.put(WARC_TRUNCATED, "WARC-Truncated");
        names.put(WARC_WARCINFO_ID, "WARC-Warcinfo-ID");
        names.put(WARC_FILENAME, "WARC-Filename");
        names.put(WARC_PROFILE, "WARC-Profile");
        names.put(WARC_IDENTIFIED_PAYLOAD_TYPE, "WARC-Identified-Payload-Type");
        names.put(WARC_SEGMENT_ORIGIN_ID, "WARC-Segment-Origin-ID");
        names.put(WARC_SEGMENT_NUMBER, "WARC-Segment-Number");
        names.put(WARC_SEGMENT_TOTAL_LENGTH, "WARC-Segment-Total-Length");
        return names;
    }

    /** Builds the per-instance table of mandatory header field value types. */
    @SuppressWarnings("rawtypes")
    private static HashMap<String, Constructor> newMandatoryFieldTypes() {
        HashMap<String, Constructor> types = new HashMap<String, Constructor>();
        types.put(WARC_RECORD_ID, strConstructor);
        types.put(CONTENT_LENGTH, intConstructor);
        types.put(WARC_DATE, strConstructor);
        types.put(WARC_TYPE, strConstructor);
        return types;
    }

    /**
     * Simple key/value pair used for the snapshots returned by entrySet().
     * Static because it needs no reference to the enclosing record.
     */
    private static class Entry<K, V> implements Map.Entry<K, V> {

        private final K key;
        private V value;

        public Entry(K theKey, V theValue) {
            key = theKey;
            value = theValue;
        }

        public K getKey() {
            return key;
        }

        public V getValue() {
            return value;
        }

        public V setValue(V theValue) {
            V oldVal = value;
            value = theValue;
            return oldVal;
        }

        /** Two entries are equal when both key and value are equal; nulls are allowed. */
        @Override
        public boolean equals(Object obj) {
            if (this == obj) {
                return true;
            }
            if (obj == null || !obj.getClass().equals(this.getClass())) {
                return false;
            }
            Entry<?, ?> other = (Entry<?, ?>) obj;
            return nullSafeEquals(key, other.key) && nullSafeEquals(value, other.value);
        }

        @Override
        public int hashCode() {
            return ((key == null ? 0 : key.hashCode()) ^ (value == null ? 0 : value.hashCode()));
        }

        @Override
        public String toString() {
            return key + "=" + value;
        }

        private static boolean nullSafeEquals(Object a, Object b) {
            return (a == null) ? (b == null) : a.equals(b);
        }
    }
}