org.warcbase.io.GenericArchiveRecordWritableTest.java Source code

Java tutorial

Introduction

Here is the source code for org.warcbase.io.GenericArchiveRecordWritableTest.java

Source

/*
 * Warcbase: an open-source platform for managing web archives
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.warcbase.io;

import com.google.common.io.Resources;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.ivy.osgi.updatesite.xml.Archive;
import org.archive.io.arc.ARCRecord;
import org.archive.io.arc.ARCRecordMetaData;
import org.junit.Test;
import org.warcbase.io.GenericArchiveRecordWritable.ArchiveFormat;
import org.warcbase.mapreduce.WacArcInputFormat;
import org.warcbase.mapreduce.WacGenericInputFormat;

import java.io.*;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

/**
 * Round-trip serialization tests for {@link GenericArchiveRecordWritable}.
 *
 * <p>Each test reads every record of a sample archive through {@link WacGenericInputFormat},
 * serializes the record with {@code write}, deserializes it into a fresh instance with
 * {@code readFields}, and compares selected header fields of the copy with the original.
 */
public class GenericArchiveRecordWritableTest {
    /** Number of records the test expects to read from {@code arc/example.arc.gz}. */
    private static final int EXPECTED_ARC_RECORDS = 300;

    /** Number of records the test expects to read from {@code warc/example.warc.gz}. */
    private static final int EXPECTED_WARC_RECORDS = 822;

    /**
     * Reads the sample ARC file and checks that every record is reported as ARC and keeps
     * its URL after a write/readFields round trip.
     *
     * @throws Exception if the fixture cannot be located, read, or (de)serialized
     */
    @Test
    public void testArcInputFormat() throws Exception {
        RecordReader<LongWritable, GenericArchiveRecordWritable> reader = openReader("arc/example.arc.gz");

        int cnt = 0;
        try {
            while (reader.nextKeyValue()) {
                GenericArchiveRecordWritable record = reader.getCurrentValue();
                cnt++;

                // Expected value first, so a failure message names the unexpected format.
                assertEquals(ArchiveFormat.ARC, record.getFormat());

                GenericArchiveRecordWritable reconstructed = roundTrip(record, ArchiveFormat.ARC);

                assertEquals(((ARCRecord) record.getRecord()).getMetaData().getUrl(),
                        ((ARCRecord) reconstructed.getRecord()).getMetaData().getUrl());
            }
        } finally {
            // Release the reader (and the stream it holds) even when an assertion fails.
            reader.close();
        }

        assertEquals(EXPECTED_ARC_RECORDS, cnt);
    }

    /**
     * Reads the sample WARC file and checks that every record is reported as WARC and keeps
     * its URL and content length after a write/readFields round trip.
     *
     * @throws Exception if the fixture cannot be located, read, or (de)serialized
     */
    @Test
    public void testWarcInputFormat() throws Exception {
        RecordReader<LongWritable, GenericArchiveRecordWritable> reader = openReader("warc/example.warc.gz");

        int cnt = 0;
        try {
            while (reader.nextKeyValue()) {
                GenericArchiveRecordWritable record = reader.getCurrentValue();
                cnt++;

                // Expected value first, so a failure message names the unexpected format.
                assertEquals(ArchiveFormat.WARC, record.getFormat());

                GenericArchiveRecordWritable reconstructed = roundTrip(record, ArchiveFormat.WARC);

                assertEquals(record.getRecord().getHeader().getUrl(),
                        reconstructed.getRecord().getHeader().getUrl());
                assertEquals(record.getRecord().getHeader().getContentLength(),
                        reconstructed.getRecord().getHeader().getContentLength());
            }
        } finally {
            // Release the reader (and the stream it holds) even when an assertion fails.
            reader.close();
        }

        assertEquals(EXPECTED_WARC_RECORDS, cnt);
    }

    /**
     * Creates and initializes a record reader over a whole classpath resource, using a
     * configuration whose default file system is the local one.
     *
     * @param resourceName classpath-relative name of the archive fixture
     * @return an initialized reader; the caller is responsible for closing it
     * @throws Exception if the resource cannot be resolved or the reader cannot be set up
     */
    private static RecordReader<LongWritable, GenericArchiveRecordWritable> openReader(String resourceName)
            throws Exception {
        // URL.getPath() keeps percent-escapes (e.g. "%20" for a space), which is not a valid
        // local path; converting through the URI yields the decoded file location.
        File testFile = new File(Resources.getResource(resourceName).toURI());

        Configuration conf = new Configuration(false);
        conf.set("fs.defaultFS", "file:///");

        // A single split spanning the entire file; no host locations are supplied.
        Path path = new Path(testFile.getAbsoluteFile().toURI());
        FileSplit split = new FileSplit(path, 0, testFile.length(), null);

        InputFormat<LongWritable, GenericArchiveRecordWritable> inputFormat = ReflectionUtils
                .newInstance(WacGenericInputFormat.class, conf);
        TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
        RecordReader<LongWritable, GenericArchiveRecordWritable> reader = inputFormat.createRecordReader(split,
                context);

        reader.initialize(split, context);
        return reader;
    }

    /**
     * Serializes {@code record} into an in-memory buffer and reads it back into a new
     * {@link GenericArchiveRecordWritable}.
     *
     * @param record the record to serialize
     * @param format the archive format to set on the new instance before deserializing
     * @return the instance populated by {@code readFields}
     * @throws IOException if writing or reading the serialized form fails
     */
    private static GenericArchiveRecordWritable roundTrip(GenericArchiveRecordWritable record,
            ArchiveFormat format) throws IOException {
        ByteArrayOutputStream bytesOut = new ByteArrayOutputStream();
        record.write(new DataOutputStream(bytesOut));

        GenericArchiveRecordWritable reconstructed = new GenericArchiveRecordWritable();
        // The format is set before readFields is called; presumably readFields relies on it
        // to choose how to parse the bytes -- confirm against GenericArchiveRecordWritable.
        reconstructed.setFormat(format);
        reconstructed.readFields(new DataInputStream(new ByteArrayInputStream(bytesOut.toByteArray())));
        return reconstructed;
    }
}