org.apache.nutch.protocol.Content.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.nutch.protocol.Content.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.nutch.protocol;

//JDK imports
import java.io.ByteArrayInputStream;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Arrays;
import java.util.zip.InflaterInputStream;

//Hadoop imports
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.ArrayFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.VersionMismatchException;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.GenericOptionsParser;

//Nutch imports
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.util.MimeUtil;
import org.apache.nutch.util.NutchConfiguration;

public final class Content implements Writable {

    public static final String DIR_NAME = "content";

    private final static int VERSION = -1;

    private int version;

    private String url;

    private String base;

    private byte[] content;

    private String contentType;

    private Metadata metadata;

    private MimeUtil mimeTypes;

    public Content() {
        metadata = new Metadata();
    }

    public Content(String url, String base, byte[] content, String contentType, Metadata metadata,
            Configuration conf) {

        if (url == null)
            throw new IllegalArgumentException("null url");
        if (base == null)
            throw new IllegalArgumentException("null base");
        if (content == null)
            throw new IllegalArgumentException("null content");
        if (metadata == null)
            throw new IllegalArgumentException("null metadata");

        this.url = url;
        this.base = base;
        this.content = content;
        this.metadata = metadata;

        this.mimeTypes = new MimeUtil(conf);
        this.contentType = getContentType(contentType, url, content);
    }

    public Content(String url, String base, byte[] content, String contentType, Metadata metadata,
            MimeUtil mimeTypes) {

        if (url == null)
            throw new IllegalArgumentException("null url");
        if (base == null)
            throw new IllegalArgumentException("null base");
        if (content == null)
            throw new IllegalArgumentException("null content");
        if (metadata == null)
            throw new IllegalArgumentException("null metadata");

        this.url = url;
        this.base = base;
        this.content = content;
        this.metadata = metadata;

        this.mimeTypes = mimeTypes;
        this.contentType = getContentType(contentType, url, content);
    }

    private final void readFieldsCompressed(DataInput in) throws IOException {
        byte oldVersion = in.readByte();
        switch (oldVersion) {
        case 0:
        case 1:
            url = Text.readString(in); // read url
            base = Text.readString(in); // read base

            content = new byte[in.readInt()]; // read content
            in.readFully(content);

            contentType = Text.readString(in); // read contentType
            // reconstruct metadata
            int keySize = in.readInt();
            String key;
            for (int i = 0; i < keySize; i++) {
                key = Text.readString(in);
                int valueSize = in.readInt();
                for (int j = 0; j < valueSize; j++) {
                    metadata.add(key, Text.readString(in));
                }
            }
            break;
        case 2:
            url = Text.readString(in); // read url
            base = Text.readString(in); // read base

            content = new byte[in.readInt()]; // read content
            in.readFully(content);

            contentType = Text.readString(in); // read contentType
            metadata.readFields(in); // read meta data
            break;
        default:
            throw new VersionMismatchException((byte) 2, oldVersion);
        }

    }

    public final void readFields(DataInput in) throws IOException {
        metadata.clear();
        int sizeOrVersion = in.readInt();
        if (sizeOrVersion < 0) { // version
            version = sizeOrVersion;
            switch (version) {
            case VERSION:
                url = Text.readString(in);
                base = Text.readString(in);

                content = new byte[in.readInt()];
                in.readFully(content);

                contentType = Text.readString(in);
                metadata.readFields(in);
                break;
            default:
                throw new VersionMismatchException((byte) VERSION, (byte) version);
            }
        } else { // size
            byte[] compressed = new byte[sizeOrVersion];
            in.readFully(compressed, 0, compressed.length);
            ByteArrayInputStream deflated = new ByteArrayInputStream(compressed);
            DataInput inflater = new DataInputStream(new InflaterInputStream(deflated));
            readFieldsCompressed(inflater);
        }
    }

    public final void write(DataOutput out) throws IOException {
        out.writeInt(VERSION);

        Text.writeString(out, url); // write url
        Text.writeString(out, base); // write base

        out.writeInt(content.length); // write content
        out.write(content);

        Text.writeString(out, contentType); // write contentType

        metadata.write(out); // write metadata
    }

    public static Content read(DataInput in) throws IOException {
        Content content = new Content();
        content.readFields(in);
        return content;
    }

    //
    // Accessor methods
    //

    /** The url fetched. */
    public String getUrl() {
        return url;
    }

    /**
     * The base url for relative links contained in the content. Maybe be
     * different from url if the request redirected.
     */
    public String getBaseUrl() {
        return base;
    }

    /** The binary content retrieved. */
    public byte[] getContent() {
        return content;
    }

    public void setContent(byte[] content) {
        this.content = content;
    }

    /**
     * The media type of the retrieved content.
     * 
     * @see <a href="http://www.iana.org/assignments/media-types/">
     *      http://www.iana.org/assignments/media-types/</a>
     */
    public String getContentType() {
        return contentType;
    }

    public void setContentType(String contentType) {
        this.contentType = contentType;
    }

    /** Other protocol-specific data. */
    public Metadata getMetadata() {
        return metadata;
    }

    /** Other protocol-specific data. */
    public void setMetadata(Metadata metadata) {
        this.metadata = metadata;
    }

    public boolean equals(Object o) {
        if (!(o instanceof Content)) {
            return false;
        }
        Content that = (Content) o;
        return this.url.equals(that.url) && this.base.equals(that.base)
                && Arrays.equals(this.getContent(), that.getContent()) && this.contentType.equals(that.contentType)
                && this.metadata.equals(that.metadata);
    }

    public String toString() {
        StringBuffer buffer = new StringBuffer();

        buffer.append("Version: " + version + "\n");
        buffer.append("url: " + url + "\n");
        buffer.append("base: " + base + "\n");
        buffer.append("contentType: " + contentType + "\n");
        buffer.append("metadata: " + metadata + "\n");
        buffer.append("Content:\n");
        buffer.append(new String(content)); // try default encoding

        return buffer.toString();

    }

    public static void main(String args[]) throws Exception {

        String usage = "Content (-local | -dfs <namenode:port>) recno batchId";

        if (args.length < 3) {
            System.out.println("usage:" + usage);
            return;
        }

        GenericOptionsParser optParser = new GenericOptionsParser(NutchConfiguration.create(), args);
        String[] argv = optParser.getRemainingArgs();
        Configuration conf = optParser.getConfiguration();

        FileSystem fs = FileSystem.get(conf);
        try {
            int recno = Integer.parseInt(argv[0]);
            String batchId = argv[1];

            Path file = new Path(batchId, DIR_NAME);
            System.out.println("Reading from file: " + file);

            ArrayFile.Reader contents = new ArrayFile.Reader(fs, file.toString(), conf);

            Content content = new Content();
            contents.get(recno, content);
            System.out.println("Retrieved " + recno + " from file " + file);

            System.out.println(content);

            contents.close();
        } finally {
            fs.close();
        }
    }

    private String getContentType(String typeName, String url, byte[] data) {
        return this.mimeTypes.autoResolveContentType(typeName, url, data);
    }

}