org.warcbase.mapreduce.WacGenericInputFormatTest.java Source code

Java tutorial

Introduction

Here is the source code for org.warcbase.mapreduce.WacGenericInputFormatTest.java

Source

/*
 * Warcbase: an open-source platform for managing web archives
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.warcbase.mapreduce;

import com.google.common.io.Resources;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
import org.apache.hadoop.util.ReflectionUtils;
import org.archive.io.ArchiveRecord;
import org.archive.io.arc.ARCRecord;
import org.archive.io.arc.ARCRecordMetaData;
import org.archive.io.warc.WARCRecord;
import org.junit.Test;
import org.warcbase.io.GenericArchiveRecordWritable;

import java.io.File;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

/**
 * Tests that {@link WacGenericInputFormat} yields the expected sequence of records when reading
 * the bundled example ARC and WARC files from the test classpath.
 */
public class WacGenericInputFormatTest {
    /**
     * Reads {@code arc/example.arc.gz} and checks that every record is an {@link ARCRecord}, that
     * the leading records carry the expected URLs, and that the total record count is 300.
     *
     * @throws Exception if the resource cannot be located or read
     */
    @Test
    public void testArcInputFormat() throws Exception {
        String[] urls = new String[] { "filedesc://IAH-20080430204825-00000-blackbook.arc", "dns:www.archive.org",
                "http://www.archive.org/robots.txt", "http://www.archive.org/",
                "http://www.archive.org/index.php" };

        RecordReader<LongWritable, GenericArchiveRecordWritable> reader = openReader("arc/example.arc.gz");
        try {
            int cnt = 0;
            while (reader.nextKeyValue()) {
                ArchiveRecord record = reader.getCurrentValue().getRecord();
                // Fails the test (and leaves the loop) before the cast below could throw.
                assertTrue(record instanceof ARCRecord);

                ARCRecordMetaData metadata = ((ARCRecord) record).getMetaData();
                // Only the first few records have a pinned URL; the rest are just counted.
                if (cnt < urls.length) {
                    assertEquals(urls[cnt], metadata.getUrl());
                }

                cnt++;
            }
            assertEquals(300, cnt);
        } finally {
            // Release the underlying archive stream even when an assertion fails.
            reader.close();
        }
    }

    /**
     * Reads {@code warc/example.warc.gz} and checks that every record is a {@link WARCRecord},
     * that the leading records carry the expected URL and {@code WARC-Type}, and that the file
     * yields 822 records in total, 299 of which are of type {@code response}.
     *
     * @throws Exception if the resource cannot be located or read
     */
    @Test
    public void testWarcInputFormat() throws Exception {
        String[] urls = new String[] { null, "dns:www.archive.org", "http://www.archive.org/robots.txt",
                "http://www.archive.org/robots.txt", "http://www.archive.org/robots.txt", "http://www.archive.org/",
                "http://www.archive.org/", "http://www.archive.org/", "http://www.archive.org/index.php",
                "http://www.archive.org/index.php" };

        String[] types = new String[] { "warcinfo", "response", "response", "request", "metadata", "response",
                "request", "metadata", "response", "request" };

        // Sanity check on the fixtures themselves: the two arrays are indexed in parallel.
        assertEquals(urls.length, types.length);

        RecordReader<LongWritable, GenericArchiveRecordWritable> reader = openReader("warc/example.warc.gz");
        try {
            int cnt = 0;
            int responseCnt = 0;
            while (reader.nextKeyValue()) {
                ArchiveRecord record = reader.getCurrentValue().getRecord();
                // Fails the test (and leaves the loop) before the cast below could throw.
                assertTrue(record instanceof WARCRecord);

                WARCRecord warcRecord = (WARCRecord) record;
                Object warcType = warcRecord.getHeader().getHeaderValue("WARC-Type");

                // Only the first few records have a pinned URL/type; the rest are just counted.
                if (cnt < urls.length) {
                    assertEquals(urls[cnt], warcRecord.getHeader().getUrl());
                    assertEquals(types[cnt], warcType);
                }

                // Constant-first comparison so a record without the header is simply not counted
                // instead of aborting the test with a NullPointerException.
                if ("response".equals(warcType)) {
                    responseCnt++;
                }

                cnt++;
            }
            assertEquals(822, cnt);
            assertEquals(299, responseCnt);
        } finally {
            // Release the underlying archive stream even when an assertion fails.
            reader.close();
        }
    }

    /**
     * Creates and initializes a record reader over a single split covering the whole of the given
     * classpath resource, using the local file system.
     *
     * @param resourceName classpath-relative name of the archive file to read
     * @return an initialized reader; the caller is responsible for closing it
     * @throws Exception if the resource cannot be resolved or the reader cannot be initialized
     */
    private static RecordReader<LongWritable, GenericArchiveRecordWritable> openReader(String resourceName)
            throws Exception {
        // Go through URI rather than URL.getPath(): the latter keeps percent-escapes (e.g. "%20"),
        // which would yield a non-existent File when the checkout path contains spaces.
        File testFile = new File(Resources.getResource(resourceName).toURI());

        Configuration conf = new Configuration(false);
        conf.set("fs.defaultFS", "file:///");

        Path path = new Path(testFile.getAbsoluteFile().toURI());
        FileSplit split = new FileSplit(path, 0, testFile.length(), null);

        InputFormat<LongWritable, GenericArchiveRecordWritable> inputFormat = ReflectionUtils
                .newInstance(WacGenericInputFormat.class, conf);
        TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
        RecordReader<LongWritable, GenericArchiveRecordWritable> reader = inputFormat.createRecordReader(split,
                context);

        boolean initialized = false;
        try {
            reader.initialize(split, context);
            initialized = true;
            return reader;
        } finally {
            // Do not leak the reader if initialization itself fails.
            if (!initialized) {
                reader.close();
            }
        }
    }
}